Update aosp/master LLVM for rebase to r222494.

Change-Id: Ic787f5e0124df789bd26f3f24680f45e678eef2d
author: Stephen Hines <srhines@google.com> 2014-12-01 14:51:49 -0800
committer: Stephen Hines <srhines@google.com> 2014-12-02 16:08:10 -0800
commit: 37ed9c199ca639565f6ce88105f9e39e898d82d0 (patch)
tree: 8fb36d3910e3ee4c4e1b7422f4f017108efc52f5 /test/CodeGen
parent: d2327b22152ced7bc46dc629fc908959e8a52d03 (diff)
download: external_llvm-37ed9c199ca639565f6ce88105f9e39e898d82d0.zip
external_llvm-37ed9c199ca639565f6ce88105f9e39e898d82d0.tar.gz
external_llvm-37ed9c199ca639565f6ce88105f9e39e898d82d0.tar.bz2
1146 files changed, 74101 insertions, 17509 deletions
diff --git a/test/CodeGen/AArch64/PBQP-chain.ll b/test/CodeGen/AArch64/PBQP-chain.ll
new file mode 100644
index 0000000..c4ba026
--- /dev/null
+++ b/test/CodeGen/AArch64/PBQP-chain.ll
@@ -0,0 +1,104 @@
+; RUN: llc < %s -mcpu=cortex-a57 -mattr=+neon -fp-contract=fast -regalloc=pbqp -pbqp-coalescing | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a57 -mattr=+neon -fp-contract=fast -regalloc=pbqp -pbqp-coalescing | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
+;
+; Test PBQP is able to fulfill the accumulator chaining constraint.
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; CHECK-LABEL: fir
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+define void @fir(double* nocapture %rx, double* nocapture %ry, double* nocapture %c, double* nocapture %x, double* nocapture %y) {
+entry:
+  %0 = load double* %c, align 8
+  %1 = load double* %x, align 8
+  %mul = fmul fast double %1, %0
+  %2 = load double* %y, align 8
+  %mul7 = fmul fast double %2, %0
+  %arrayidx.1 = getelementptr inbounds double* %c, i64 1
+  %3 = load double* %arrayidx.1, align 8
+  %arrayidx2.1 = getelementptr inbounds double* %x, i64 1
+  %4 = load double* %arrayidx2.1, align 8
+  %mul.1 = fmul fast double %4, %3
+  %add.1 = fadd fast double %mul.1, %mul
+  %arrayidx6.1 = getelementptr inbounds double* %y, i64 1
+  %5 = load double* %arrayidx6.1, align 8
+  %mul7.1 = fmul fast double %5, %3
+  %add8.1 = fadd fast double %mul7.1, %mul7
+  %arrayidx.2 = getelementptr inbounds double* %c, i64 2
+  %6 = load double* %arrayidx.2, align 8
+  %arrayidx2.2 = getelementptr inbounds double* %x, i64 2
+  %7 = load double* %arrayidx2.2, align 8
+  %mul.2 = fmul fast double %7, %6
+  %add.2 = fadd fast double %mul.2, %add.1
+  %arrayidx6.2 = getelementptr inbounds double* %y, i64 2
+  %8 = load double* %arrayidx6.2, align 8
+  %mul7.2 = fmul fast double %8, %6
+  %add8.2 = fadd fast double %mul7.2, %add8.1
+  %arrayidx.3 = getelementptr inbounds double* %c, i64 3
+  %9 = load double* %arrayidx.3, align 8
+  %arrayidx2.3 = getelementptr inbounds double* %x, i64 3
+  %10 = load double* %arrayidx2.3, align 8
+  %mul.3 = fmul fast double %10, %9
+  %add.3 = fadd fast double %mul.3, %add.2
+  %arrayidx6.3 = getelementptr inbounds double* %y, i64 3
+  %11 = load double* %arrayidx6.3, align 8
+  %mul7.3 = fmul fast double %11, %9
+  %add8.3 = fadd fast double %mul7.3, %add8.2
+  %arrayidx.4 = getelementptr inbounds double* %c, i64 4
+  %12 = load double* %arrayidx.4, align 8
+  %arrayidx2.4 = getelementptr inbounds double* %x, i64 4
+  %13 = load double* %arrayidx2.4, align 8
+  %mul.4 = fmul fast double %13, %12
+  %add.4 = fadd fast double %mul.4, %add.3
+  %arrayidx6.4 = getelementptr inbounds double* %y, i64 4
+  %14 = load double* %arrayidx6.4, align 8
+  %mul7.4 = fmul fast double %14, %12
+  %add8.4 = fadd fast double %mul7.4, %add8.3
+  %arrayidx.5 = getelementptr inbounds double* %c, i64 5
+  %15 = load double* %arrayidx.5, align 8
+  %arrayidx2.5 = getelementptr inbounds double* %x, i64 5
+  %16 = load double* %arrayidx2.5, align 8
+  %mul.5 = fmul fast double %16, %15
+  %add.5 = fadd fast double %mul.5, %add.4
+  %arrayidx6.5 = getelementptr inbounds double* %y, i64 5
+  %17 = load double* %arrayidx6.5, align 8
+  %mul7.5 = fmul fast double %17, %15
+  %add8.5 = fadd fast double %mul7.5, %add8.4
+  %arrayidx.6 = getelementptr inbounds double* %c, i64 6
+  %18 = load double* %arrayidx.6, align 8
+  %arrayidx2.6 = getelementptr inbounds double* %x, i64 6
+  %19 = load double* %arrayidx2.6, align 8
+  %mul.6 = fmul fast double %19, %18
+  %add.6 = fadd fast double %mul.6, %add.5
+  %arrayidx6.6 = getelementptr inbounds double* %y, i64 6
+  %20 = load double* %arrayidx6.6, align 8
+  %mul7.6 = fmul fast double %20, %18
+  %add8.6 = fadd fast double %mul7.6, %add8.5
+  %arrayidx.7 = getelementptr inbounds double* %c, i64 7
+  %21 = load double* %arrayidx.7, align 8
+  %arrayidx2.7 = getelementptr inbounds double* %x, i64 7
+  %22 = load double* %arrayidx2.7, align 8
+  %mul.7 = fmul fast double %22, %21
+  %add.7 = fadd fast double %mul.7, %add.6
+  %arrayidx6.7 = getelementptr inbounds double* %y, i64 7
+  %23 = load double* %arrayidx6.7, align 8
+  %mul7.7 = fmul fast double %23, %21
+  %add8.7 = fadd fast double %mul7.7, %add8.6
+  store double %add.7, double* %rx, align 8
+  store double %add8.7, double* %ry, align 8
+  ret void
+}
+
diff --git a/test/CodeGen/AArch64/PBQP-coalesce-benefit.ll b/test/CodeGen/AArch64/PBQP-coalesce-benefit.ll
new file mode 100644
index 0000000..45ac5e6
--- /dev/null
+++ b/test/CodeGen/AArch64/PBQP-coalesce-benefit.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a57 -mattr=+neon -fp-contract=fast -regalloc=pbqp -pbqp-coalescing | FileCheck %s
+
+; CHECK-LABEL: test:
+define i32 @test(i32 %acc, i32* nocapture readonly %c) {
+entry:
+  %0 = load i32* %c, align 4
+; CHECK-NOT: mov	 w{{[0-9]*}}, w0
+  %add = add nsw i32 %0, %acc
+  %arrayidx1 = getelementptr inbounds i32* %c, i64 1
+  %1 = load i32* %arrayidx1, align 4
+  %add2 = add nsw i32 %add, %1
+  ret i32 %add2
+}
+
diff --git a/test/CodeGen/AArch64/PBQP-csr.ll b/test/CodeGen/AArch64/PBQP-csr.ll
new file mode 100644
index 0000000..64335ae
--- /dev/null
+++ b/test/CodeGen/AArch64/PBQP-csr.ll
@@ -0,0 +1,91 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a57 -mattr=+neon -fp-contract=fast -regalloc=pbqp -pbqp-coalescing | FileCheck %s
+
+%pl = type { i32, i32, i32, i32, %p*, %l*, double* }
+%p = type { i32, %ca*, [27 x %ca*], %v*, %v*, %v*, i32 }
+%ca = type { %v, float, i32 }
+%v = type { double, double, double }
+%l = type opaque
+%rs = type { i32, i32, i32, i32, %v*, %v*, [21 x double], %v, %v, %v, double, double, double }
+
+;CHECK-LABEL: test_csr
+define void @test_csr(%pl* nocapture readnone %this, %rs* nocapture %r) align 2 {
+;CHECK-NOT: stp {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+  %x.i = getelementptr inbounds %rs* %r, i64 0, i32 7, i32 0
+  %y.i = getelementptr inbounds %rs* %r, i64 0, i32 7, i32 1
+  %z.i = getelementptr inbounds %rs* %r, i64 0, i32 7, i32 2
+  %x.i61 = getelementptr inbounds %rs* %r, i64 0, i32 8, i32 0
+  %y.i62 = getelementptr inbounds %rs* %r, i64 0, i32 8, i32 1
+  %z.i63 = getelementptr inbounds %rs* %r, i64 0, i32 8, i32 2
+  %x.i58 = getelementptr inbounds %rs* %r, i64 0, i32 9, i32 0
+  %y.i59 = getelementptr inbounds %rs* %r, i64 0, i32 9, i32 1
+  %z.i60 = getelementptr inbounds %rs* %r, i64 0, i32 9, i32 2
+  %na = getelementptr inbounds %rs* %r, i64 0, i32 0
+  %0 = bitcast double* %x.i to i8*
+  call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 72, i32 8, i1 false)
+  %1 = load i32* %na, align 4
+  %cmp70 = icmp sgt i32 %1, 0
+  br i1 %cmp70, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %fn = getelementptr inbounds %rs* %r, i64 0, i32 4
+  %2 = load %v** %fn, align 8
+  %fs = getelementptr inbounds %rs* %r, i64 0, i32 5
+  %3 = load %v** %fs, align 8
+  %4 = sext i32 %1 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %5 = phi double [ 0.000000e+00, %for.body.lr.ph ], [ %add6.i, %for.body ]
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %6 = phi <2 x double> [ zeroinitializer, %for.body.lr.ph ], [ %17, %for.body ]
+  %7 = phi <2 x double> [ zeroinitializer, %for.body.lr.ph ], [ %22, %for.body ]
+  %8 = phi <2 x double> [ zeroinitializer, %for.body.lr.ph ], [ %26, %for.body ]
+  %9 = phi <2 x double> [ zeroinitializer, %for.body.lr.ph ], [ %28, %for.body ]
+  %x.i54 = getelementptr inbounds %v* %2, i64 %indvars.iv, i32 0
+  %x1.i = getelementptr inbounds %v* %3, i64 %indvars.iv, i32 0
+  %y.i56 = getelementptr inbounds %v* %2, i64 %indvars.iv, i32 1
+  %10 = bitcast double* %x.i54 to <2 x double>*
+  %11 = load <2 x double>* %10, align 8
+  %y2.i = getelementptr inbounds %v* %3, i64 %indvars.iv, i32 1
+  %12 = bitcast double* %x1.i to <2 x double>*
+  %13 = load <2 x double>* %12, align 8
+  %14 = fadd fast <2 x double> %13, %11
+  %z.i57 = getelementptr inbounds %v* %2, i64 %indvars.iv, i32 2
+  %15 = load double* %z.i57, align 8
+  %z4.i = getelementptr inbounds %v* %3, i64 %indvars.iv, i32 2
+  %16 = load double* %z4.i, align 8
+  %add5.i = fadd fast double %16, %15
+  %17 = fadd fast <2 x double> %6, %11
+  %18 = bitcast double* %x.i to <2 x double>*
+  store <2 x double> %17, <2 x double>* %18, align 8
+  %19 = load double* %x1.i, align 8
+  %20 = insertelement <2 x double> undef, double %15, i32 0
+  %21 = insertelement <2 x double> %20, double %19, i32 1
+  %22 = fadd fast <2 x double> %7, %21
+  %23 = bitcast double* %z.i to <2 x double>*
+  store <2 x double> %22, <2 x double>* %23, align 8
+  %24 = bitcast double* %y2.i to <2 x double>*
+  %25 = load <2 x double>* %24, align 8
+  %26 = fadd fast <2 x double> %8, %25
+  %27 = bitcast double* %y.i62 to <2 x double>*
+  store <2 x double> %26, <2 x double>* %27, align 8
+  %28 = fadd fast <2 x double> %14, %9
+  %29 = bitcast double* %x.i58 to <2 x double>*
+  store <2 x double> %28, <2 x double>* %29, align 8
+  %add6.i = fadd fast double %add5.i, %5
+  store double %add6.i, double* %z.i60, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp = icmp slt i64 %indvars.iv.next, %4
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+
diff --git a/test/CodeGen/AArch64/PBQP.ll b/test/CodeGen/AArch64/PBQP.ll
new file mode 100644
index 0000000..675a2ca
--- /dev/null
+++ b/test/CodeGen/AArch64/PBQP.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 -regalloc=pbqp -pbqp-coalescing -o - %s | FileCheck %s
+
+define i32 @foo(i32 %a) {
+; CHECK-LABEL: foo:
+; CHECK: bl bar
+; CHECK: bl baz
+  %call = call i32 @bar(i32 %a)
+  %call1 = call i32 @baz(i32 %call)
+  ret i32 %call1
+}
+
+declare i32 @bar(i32)
+declare i32 @baz(i32)
+
diff --git a/test/CodeGen/AArch64/Redundantstore.ll b/test/CodeGen/AArch64/Redundantstore.ll
new file mode 100644
index 0000000..72f7f46
--- /dev/null
+++ b/test/CodeGen/AArch64/Redundantstore.ll
@@ -0,0 +1,25 @@
+; RUN: llc -O3 -march=aarch64 < %s | FileCheck %s 
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+@end_of_array = common global i8* null, align 8
+
+; CHECK-LABEL: @test
+; CHECK: stur
+; CHECK-NOT: stur
+define i8* @test(i32 %size) {
+entry:
+  %0 = load i8** @end_of_array, align 8
+  %conv = sext i32 %size to i64
+  %and = and i64 %conv, -8
+  %conv2 = trunc i64 %and to i32
+  %add.ptr.sum = add nsw i64 %and, -4
+  %add.ptr3 = getelementptr inbounds i8* %0, i64 %add.ptr.sum
+  %size4 = bitcast i8* %add.ptr3 to i32*
+  store i32 %conv2, i32* %size4, align 4
+  %add.ptr.sum9 = add nsw i64 %and, -4
+  %add.ptr5 = getelementptr inbounds i8* %0, i64 %add.ptr.sum9
+  %size6 = bitcast i8* %add.ptr5 to i32*
+  store i32 %conv2, i32* %size6, align 4
+  ret i8* %0
+}
+
diff --git a/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll b/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
new file mode 100644
index 0000000..4da33a0
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
@@ -0,0 +1,106 @@
+; RUN: llc < %s -O2 -mtriple=aarch64-none-linux-gnu 
+
+; Bug 20598
+
+
+define void @test() #0 {
+entry:
+  br label %for.body, !dbg !39
+
+for.body:                                         ; preds = %for.body, %entry
+  %arrayidx5 = getelementptr inbounds i32* null, i64 1, !dbg !43
+  %0 = load i32* null, align 4, !dbg !45, !tbaa !46
+  %s1 = sub nsw i32 0, %0, !dbg !50
+  %n1 = sext i32 %s1 to i64, !dbg !50
+  %arrayidx21 = getelementptr inbounds i32* null, i64 3, !dbg !51
+  %add53 = add nsw i64 %n1, 0, !dbg !52
+  %add55 = add nsw i64 %n1, 0, !dbg !53
+  %mul63 = mul nsw i64 %add53, -20995, !dbg !54
+  tail call void @llvm.dbg.value(metadata !{i64 %mul63}, i64 0, metadata !30, metadata !{metadata !"0x102"}), !dbg !55
+  %mul65 = mul nsw i64 %add55, -3196, !dbg !56
+  %add67 = add nsw i64 0, %mul65, !dbg !57
+  %add80 = add i64 0, 1024, !dbg !58
+  %add81 = add i64 %add80, %mul63, !dbg !58
+  %add82 = add i64 %add81, 0, !dbg !58
+  %shr83351 = lshr i64 %add82, 11, !dbg !58
+  %conv84 = trunc i64 %shr83351 to i32, !dbg !58
+  store i32 %conv84, i32* %arrayidx21, align 4, !dbg !58, !tbaa !46
+  %add86 = add i64 0, 1024, !dbg !59
+  %add87 = add i64 %add86, 0, !dbg !59
+  %add88 = add i64 %add87, %add67, !dbg !59
+  %shr89352 = lshr i64 %add88, 11, !dbg !59
+  %n2 = trunc i64 %shr89352 to i32, !dbg !59
+  store i32 %n2, i32* %arrayidx5, align 4, !dbg !59, !tbaa !46
+  br label %for.body, !dbg !39
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!36, !37}
+!llvm.ident = !{!38}
+
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.6.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [] [] []
+!1 = metadata !{metadata !"test.c", metadata !""}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x2e\00\00\00\00140\000\001\000\006\00256\001\00141", metadata !1, metadata !5, metadata !6, null, void ()* @test, null, null, metadata !12} ; [ DW_TAG_subprogram ] [] [] [def] [scope 141] []
+!5 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ] [] []
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [] [] [from ]
+!7 = metadata !{null, metadata !8}
+!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [] [] []
+!9 = metadata !{metadata !"0x16\00\0030\000\000\000\000", metadata !10, null, metadata !11} ; [ DW_TAG_typedef ] [] [] [] [from int]
+!10 = metadata !{metadata !"", metadata !""}
+!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [] [int] []
+!12 = metadata !{metadata !13, metadata !14, metadata !18, metadata !19, metadata !20, metadata !21, metadata !22, metadata !23, metadata !24, metadata !25, metadata !26, metadata !27, metadata !28, metadata !29, metadata !30, metadata !31, metadata !32, metadata !33, metadata !34, metadata !35}
+!13 = metadata !{metadata !"0x101\00\0016777356\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [] [data] []
+!14 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!15 = metadata !{metadata !"0x16\00\00183\000\000\000\000", metadata !16, null, metadata !17} ; [ DW_TAG_typedef ] [] [INT32] [] [from long int]
+!16 = metadata !{metadata !"", metadata !""}
+!17 = metadata !{metadata !"0x24\00\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [] [long int] []
+!18 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!19 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!20 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!21 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!22 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!23 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!24 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!25 = metadata !{metadata !"0x100\00\00143\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!26 = metadata !{metadata !"0x100\00\00143\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!27 = metadata !{metadata !"0x100\00\00143\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!28 = metadata !{metadata !"0x100\00\00143\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!29 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!30 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!31 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!32 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [ ] [] []
+!33 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!34 = metadata !{metadata !"0x100\00\00145\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [  ] [] []
+!35 = metadata !{metadata !"0x100\00\00146\000", metadata !4, metadata !5, metadata !11} ; [ DW_TAG_auto_variable ] [  ] [] []
+!36 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!37 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!38 = metadata !{metadata !"clang version 3.6.0 "}
+!39 = metadata !{i32 154, i32 8, metadata !40, null}
+!40 = metadata !{metadata !"0xb\00154\008\002", metadata !1, metadata !41} ; [ DW_TAG_lexical_block ] [  ] []
+!41 = metadata !{metadata !"0xb\00154\008\001", metadata !1, metadata !42} ; [ DW_TAG_lexical_block ] [  ] []
+!42 = metadata !{metadata !"0xb\00154\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [  ] []
+!43 = metadata !{i32 157, i32 5, metadata !44, null}
+!44 = metadata !{metadata !"0xb\00154\0042\000", metadata !1, metadata !42} ; [ DW_TAG_lexical_block ] [  ] []
+!45 = metadata !{i32 159, i32 5, metadata !44, null}
+!46 = metadata !{metadata !47, metadata !47, i64 0}
+!47 = metadata !{metadata !"int", metadata !48, i64 0}
+!48 = metadata !{metadata !"omnipotent char", metadata !49, i64 0}
+!49 = metadata !{metadata !"Simple C/C++ TBAA"}
+!50 = metadata !{i32 160, i32 5, metadata !44, null}
+!51 = metadata !{i32 161, i32 5, metadata !44, null}
+!52 = metadata !{i32 188, i32 5, metadata !44, null}
+!53 = metadata !{i32 190, i32 5, metadata !44, null}
+!54 = metadata !{i32 198, i32 5, metadata !44, null}
+!55 = metadata !{i32 144, i32 13, metadata !4, null}
+!56 = metadata !{i32 200, i32 5, metadata !44, null}
+!57 = metadata !{i32 203, i32 5, metadata !44, null}
+!58 = metadata !{i32 207, i32 5, metadata !44, null}
+!59 = metadata !{i32 208, i32 5, metadata !44, null}
diff --git a/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll b/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
index fb229fc..7108bc0 100644
--- a/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
+++ b/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
-; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A57 --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A57 --check-prefix CHECK-ODD
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-ODD
 
 ; Test the AArch64A57FPLoadBalancing pass. This pass relies heavily on register allocation, so
 ; our test strategy is to:
@@ -73,7 +75,9 @@ entry:
 ; CHECK: fmsub [[x]]
 ; CHECK: fmadd [[y]]
 ; CHECK: fmadd [[x]]
-; CHECK: stp [[x]], [[y]]
+; CHECK-A57: stp [[x]], [[y]]
+; CHECK-A53-DAG: str [[x]]
+; CHECK-A53-DAG: str [[y]]
 
 define void @f2(double* nocapture readonly %p, double* nocapture %q) #0 {
 entry:
@@ -166,7 +170,9 @@ declare void @g(...) #1
 ; CHECK: fmsub [[x]]
 ; CHECK: fmadd [[y]]
 ; CHECK: fmadd [[x]]
-; CHECK: stp [[x]], [[y]]
+; CHECK-A57: stp [[x]], [[y]]
+; CHECK-A53-DAG: str [[x]]
+; CHECK-A53-DAG: str [[y]]
 
 define void @f4(float* nocapture readonly %p, float* nocapture %q) #0 {
 entry:
diff --git a/test/CodeGen/AArch64/aarch64-be-bv.ll b/test/CodeGen/AArch64/aarch64-be-bv.ll
new file mode 100644
index 0000000..01642a4
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-be-bv.ll
@@ -0,0 +1,831 @@
+; RUN: llc -mtriple=aarch64_be--linux-gnu < %s | FileCheck %s
+
+@vec_v8i16 = global <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
+
+; CHECK-LABEL: movi_modimm_t1:
+define i16 @movi_modimm_t1() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].4s, #0x1
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t2:
+define i16 @movi_modimm_t2() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].4s, #0x1, lsl #8
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t3:
+define i16 @movi_modimm_t3() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].4s, #0x1, lsl #16
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t4:
+define i16 @movi_modimm_t4() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].4s, #0x1, lsl #24
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t5:
+define i16 @movi_modimm_t5() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].8h, #0x1
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t6:
+define i16 @movi_modimm_t6() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].8h, #0x1, lsl #8
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t7:
+define i16 @movi_modimm_t7() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].4s, #0x1, msl #8
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 511, i16 0, i16 511, i16 0, i16 511, i16 0, i16 511, i16 0>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t8:
+define i16 @movi_modimm_t8() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].4s, #0x1, msl #16
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 65535, i16 1, i16 65535, i16 1, i16 65535, i16 1, i16 65535, i16 1>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t9:
+define i16 @movi_modimm_t9() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].16b, #0x1
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t10:
+define i16 @movi_modimm_t10() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].2d, #0x00ffff0000ffff
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 -1, i16 0, i16 -1, i16 0, i16 -1, i16 0, i16 -1, i16 0>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: fmov_modimm_t11:
+define i16 @fmov_modimm_t11() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    fmov    v[[REG2:[0-9]+]].4s, #3.00000000
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 0, i16 16448, i16 0, i16 16448, i16 0, i16 16448, i16 0, i16 16448>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: fmov_modimm_t12:
+define i16 @fmov_modimm_t12() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    fmov    v[[REG2:[0-9]+]].2d, #0.17968750
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 0, i16 0, i16 0, i16 16327, i16 0, i16 0, i16 0, i16 16327>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t1:
+define i16 @mvni_modimm_t1() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    mvni	   v[[REG2:[0-9]+]].4s, #0x1
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t2:
+define i16 @mvni_modimm_t2() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    mvni	   v[[REG2:[0-9]+]].4s, #0x1, lsl #8
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t3:
+define i16 @mvni_modimm_t3() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    mvni	   v[[REG2:[0-9]+]].4s, #0x1, lsl #16
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t4:
+define i16 @mvni_modimm_t4() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    mvni	   v[[REG2:[0-9]+]].4s, #0x1, lsl #24
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t5:
+define i16 @mvni_modimm_t5() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    mvni	   v[[REG2:[0-9]+]].8h, #0x1
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t6:
+define i16 @mvni_modimm_t6() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    mvni	   v[[REG2:[0-9]+]].8h, #0x1, lsl #8
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t7:
+define i16 @mvni_modimm_t7() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    mvni	   v[[REG2:[0-9]+]].4s, #0x1, msl #8
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 65024, i16 65535, i16 65024, i16 65535, i16 65024, i16 65535, i16 65024, i16 65535>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t8:
+define i16 @mvni_modimm_t8() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    mvni	   v[[REG2:[0-9]+]].4s, #0x1, msl #16
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = add <8 x i16> %in, <i16 0, i16 65534, i16 0, i16 65534, i16 0, i16 65534, i16 0, i16 65534>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t1:
+define i16 @bic_modimm_t1() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    bic	   v[[REG2:[0-9]+]].4s, #0x1
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = and <8 x i16> %in, <i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t2:
+define i16 @bic_modimm_t2() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    bic	   v[[REG2:[0-9]+]].4s, #0x1, lsl #8
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = and <8 x i16> %in, <i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t3:
+define i16 @bic_modimm_t3() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    bic	   v[[REG2:[0-9]+]].4s, #0x1, lsl #16
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = and <8 x i16> %in, <i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t4:
+define i16 @bic_modimm_t4() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    bic	   v[[REG2:[0-9]+]].4s, #0x1, lsl #24
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = and <8 x i16> %in, <i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t5:
+define i16 @bic_modimm_t5() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    bic	   v[[REG2:[0-9]+]].8h, #0x1
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = and <8 x i16> %in, <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t6:
+define i16 @bic_modimm_t6() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    bic	   v[[REG2:[0-9]+]].8h, #0x1, lsl #8
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = and <8 x i16> %in, <i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t1:
+define i16 @orr_modimm_t1() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    orr	   v[[REG2:[0-9]+]].4s, #0x1
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = or <8 x i16> %in, <i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t2:
+define i16 @orr_modimm_t2() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    orr     v[[REG2:[0-9]+]].4s, #0x1, lsl #8
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = or <8 x i16> %in, <i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t3:
+define i16 @orr_modimm_t3() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    orr	   v[[REG2:[0-9]+]].4s, #0x1, lsl #16
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = or <8 x i16> %in, <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t4:
+define i16 @orr_modimm_t4() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    orr	   v[[REG2:[0-9]+]].4s, #0x1, lsl #24
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = or <8 x i16> %in, <i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t5:
+define i16 @orr_modimm_t5() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    orr	   v[[REG2:[0-9]+]].8h, #0x1
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = or <8 x i16> %in, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t6:
+define i16 @orr_modimm_t6() nounwind {
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    orr	   v[[REG2:[0-9]+]].8h, #0x1, lsl #8
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16
+  %rv = or <8 x i16> %in, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>
+  %el = extractelement <8 x i16> %rv, i32 0
+  ret i16 %el
+}
+
+declare i8 @f_v8i8(<8 x i8> %arg)
+declare i16 @f_v4i16(<4 x i16> %arg)
+declare i32 @f_v2i32(<2 x i32> %arg)
+declare i64 @f_v1i64(<1 x i64> %arg)
+declare i8 @f_v16i8(<16 x i8> %arg)
+declare i16 @f_v8i16(<8 x i16> %arg)
+declare i32 @f_v4i32(<4 x i32> %arg)
+declare i64 @f_v2i64(<2 x i64> %arg)
+
+; CHECK-LABEL: modimm_t1_call:
+define void @modimm_t1_call() {
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 8, i8 0, i8 0, i8 0, i8 8, i8 0, i8 0, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x7
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 7, i16 0, i16 7, i16 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x6
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 6, i32 6>)
+  ; CHECK:         movi    v{{[0-9]+}}.2s, #0x5
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 21474836485>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x5
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x4
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 4, i16 0, i16 4, i16 0, i16 4, i16 0, i16 4, i16 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x3
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 3, i32 3, i32 3, i32 3>)
+  ; CHECK:         movi    v[[REG:[0-9]+]].4s, #0x2
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 8589934594, i64 8589934594>)
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t2_call:
+define void @modimm_t2_call() {
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x8, lsl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 0, i8 8, i8 0, i8 0, i8 0, i8 8, i8 0, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x7, lsl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 1792, i16 0, i16 1792, i16 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x6, lsl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 1536, i32 1536>)
+  ; CHECK:         movi    v{{[0-9]+}}.2s, #0x5, lsl #8
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 5497558140160>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x5, lsl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x4, lsl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 1024, i16 0, i16 1024, i16 0, i16 1024, i16 0, i16 1024, i16 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x3, lsl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 768, i32 768, i32 768, i32 768>)
+  ; CHECK:         movi    v[[REG:[0-9]+]].4s, #0x2, lsl #8
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 2199023256064, i64 2199023256064>)
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t3_call:
+define void @modimm_t3_call() {
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x8, lsl #16
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 0, i8 0, i8 8, i8 0, i8 0, i8 0, i8 8, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x7, lsl #16
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 0, i16 7, i16 0, i16 7>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x6, lsl #16
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 393216, i32 393216>)
+  ; CHECK:         movi    v{{[0-9]+}}.2s, #0x5, lsl #16
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 1407374883880960>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x5, lsl #16
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x4, lsl #16
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 0, i16 4, i16 0, i16 4, i16 0, i16 4, i16 0, i16 4>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x3, lsl #16
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 196608, i32 196608, i32 196608, i32 196608>)
+  ; CHECK:         movi    v[[REG:[0-9]+]].4s, #0x2, lsl #16
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 562949953552384, i64 562949953552384>)
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t4_call:
+define void @modimm_t4_call() {
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x8, lsl #24
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 0, i8 0, i8 0, i8 8, i8 0, i8 0, i8 0, i8 8>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x7, lsl #24
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 0, i16 1792, i16 0, i16 1792>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x6, lsl #24
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 100663296, i32 100663296>)
+  ; CHECK:         movi    v{{[0-9]+}}.2s, #0x5, lsl #24
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 360287970273525760>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x5, lsl #24
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x4, lsl #24
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 0, i16 1024, i16 0, i16 1024, i16 0, i16 1024, i16 0, i16 1024>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x3, lsl #24
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 50331648, i32 50331648, i32 50331648, i32 50331648>)
+  ; CHECK:         movi    v[[REG:[0-9]+]].4s, #0x2, lsl #24
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 144115188109410304, i64 144115188109410304>)
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t5_call:
+define void @modimm_t5_call() {
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4h, #0x8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 8, i8 0, i8 8, i8 0, i8 8, i8 0, i8 8, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4h, #0x7
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 7, i16 7, i16 7, i16 7>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4h, #0x6
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 393222, i32 393222>)
+  ; CHECK:         movi    v{{[0-9]+}}.4h, #0x5
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 1407396358717445>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8h, #0x5
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8h, #0x4
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8h, #0x3
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 196611, i32 196611, i32 196611, i32 196611>)
+  ; CHECK:         movi    v[[REG:[0-9]+]].8h, #0x2
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 562958543486978, i64 562958543486978>)
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t6_call:
+define void @modimm_t6_call() {
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4h, #0x8, lsl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 0, i8 8, i8 0, i8 8, i8 0, i8 8, i8 0, i8 8>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4h, #0x7, lsl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 1792, i16 1792, i16 1792, i16 1792>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4h, #0x6, lsl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 100664832, i32 100664832>)
+  ; CHECK:         movi    v{{[0-9]+}}.4h, #0x5, lsl #8
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 360293467831665920>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8h, #0x5, lsl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8h, #0x4, lsl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 1024, i16 1024, i16 1024, i16 1024, i16 1024, i16 1024, i16 1024, i16 1024>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8h, #0x3, lsl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 50332416, i32 50332416, i32 50332416, i32 50332416>)
+  ; CHECK:         movi    v[[REG:[0-9]+]].8h, #0x2, lsl #8
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 144117387132666368, i64 144117387132666368>)
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t7_call:
+define void @modimm_t7_call() {
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x8, msl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 255, i8 8, i8 0, i8 0, i8 255, i8 8, i8 0, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x7, msl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 2047, i16 0, i16 2047, i16 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x6, msl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 1791, i32 1791>)
+  ; CHECK:         movi    v{{[0-9]+}}.2s, #0x5, msl #8
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 6592774800895>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x5, msl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 255, i8 5, i8 0, i8 0, i8 255, i8 5, i8 0, i8 0, i8 255, i8 5, i8 0, i8 0, i8 255, i8 5, i8 0, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x4, msl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 1279, i16 0, i16 1279, i16 0, i16 1279, i16 0, i16 1279, i16 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x3, msl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 1023, i32 1023, i32 1023, i32 1023>)
+  ; CHECK:         movi    v[[REG:[0-9]+]].4s, #0x2, msl #8
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 3294239916799, i64 3294239916799>)
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t8_call:
+define void @modimm_t8_call() {
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x8, msl #16
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 255, i8 255, i8 8, i8 0, i8 255, i8 255, i8 8, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x7, msl #16
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 65535, i16 7, i16 65535, i16 7>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x6, msl #16
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 458751, i32 458751>)
+  ; CHECK:         movi    v{{[0-9]+}}.2s, #0x5, msl #16
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 1688845565689855>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x5, msl #16
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 255, i8 255, i8 5, i8 0, i8 255, i8 255, i8 5, i8 0, i8 255, i8 255, i8 5, i8 0, i8 255, i8 255, i8 5, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x4, msl #16
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 65535, i16 4, i16 65535, i16 4, i16 65535, i16 4, i16 65535, i16 4>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x3, msl #16
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 262143, i32 262143, i32 262143, i32 262143>)
+  ; CHECK:         movi    v[[REG:[0-9]+]].4s, #0x2, msl #16
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 844420635361279, i64 844420635361279>)
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t9_call:
+define void @modimm_t9_call() {
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8b, #0x8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8b, #0x7
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 1799, i16 1799, i16 1799, i16 1799>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8b, #0x6
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 101058054, i32 101058054>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].16b, #0x5
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].16b, #0x4
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].16b, #0x3
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 50529027, i32 50529027, i32 50529027, i32 50529027>)
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t10_call:
+define void @modimm_t10_call() {
+  ; CHECK:         movi    d[[REG1:[0-9]+]], #0x0000ff000000ff
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 0, i8 0, i8 0>)
+  ; CHECK:         movi    d[[REG1:[0-9]+]], #0x00ffff0000ffff
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 -1, i16 0, i16 -1, i16 0>)
+  ; CHECK:         movi    d[[REG1:[0-9]+]], #0xffffffffffffffff
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 -1, i32 -1>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2d, #0xffffff00ffffff
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2d, #0xffffffffffff0000
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 0, i16 -1, i16 -1, i16 -1, i16 0, i16 -1, i16 -1, i16 -1>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2d, #0xffffffff00000000
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 0, i32 -1, i32 0, i32 -1>)
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t11_call:
+define void @modimm_t11_call() {
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].2s, #4.00000000
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 0, i8 0, i8 128, i8 64, i8 0, i8 0, i8 128, i8 64>)
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].2s, #3.75000000
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 0, i16 16496, i16 0, i16 16496>)
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].2s, #3.50000000
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 1080033280, i32 1080033280>)
+  ; CHECK:         fmov    v{{[0-9]+}}.2s, #0.39062500
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 4523865826746957824>)
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].4s, #3.25000000
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 0, i8 0, i8 80, i8 64, i8 0, i8 0, i8 80, i8 64, i8 0, i8 0, i8 80, i8 64, i8 0, i8 0, i8 80, i8 64>)
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].4s, #3.00000000
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 0, i16 16448, i16 0, i16 16448, i16 0, i16 16448, i16 0, i16 16448>)
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].4s, #2.75000000
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 1076887552, i32 1076887552, i32 1076887552, i32 1076887552>)
+  ; CHECK:         fmov    v[[REG:[0-9]+]].4s, #2.5000000
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 4620693218757967872, i64 4620693218757967872>)
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t12_call:
+define void @modimm_t12_call() {
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].2d, #0.18750000
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 200, i8 63, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 200, i8 63>)
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].2d, #0.17968750
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 16327, i16 0, i16 0, i16 0, i16 16327>)
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].2d, #0.17187500
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 0, i32 1069940736, i32 0, i32 1069940736>)
+
+  ret void
+}
diff --git a/test/CodeGen/AArch64/aarch64-gep-opt.ll b/test/CodeGen/AArch64/aarch64-gep-opt.ll
new file mode 100644
index 0000000..811eed9
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-gep-opt.ll
@@ -0,0 +1,163 @@
+; RUN: llc -O3 -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -O3 -print-after=codegenprepare -mcpu=cyclone < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-NoAA <%t %s
+; RUN: llc -O3 -print-after=codegenprepare -mcpu=cortex-a53 < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-UseAA <%t %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linux-gnueabi"
+
+; Following test cases test enabling SeparateConstOffsetFromGEP pass in AArch64
+; backend. If useAA() returns true, it will lower a GEP with multiple indices
+; into GEPs with a single index, otherwise it will lower it into a
+; "ptrtoint+arithmetics+inttoptr" form.
+
+%struct = type { i32, i32, i32, i32, [20 x i32] }
+
+; Check that when two complex GEPs are used in two basic blocks, LLVM can
+; elimilate the common subexpression for the second use.
+define void @test_GEP_CSE([240 x %struct]* %string, i32* %adj, i32 %lib, i64 %idxprom) {
+  %liberties = getelementptr [240 x %struct]* %string, i64 1, i64 %idxprom, i32 3
+  %1 = load i32* %liberties, align 4
+  %cmp = icmp eq i32 %1, %lib
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %origin = getelementptr [240 x %struct]* %string, i64 1, i64 %idxprom, i32 2
+  %2 = load i32* %origin, align 4
+  store i32 %2, i32* %adj, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+; CHECK-LABEL: test_GEP_CSE:
+; CHECK: madd
+; CHECK: ldr
+; CHECK-NOT: madd
+; CHECK:ldr
+
+; CHECK-NoAA-LABEL: @test_GEP_CSE(
+; CHECK-NoAA: [[PTR0:%[a-zA-Z0-9]+]] = ptrtoint [240 x %struct]* %string to i64
+; CHECK-NoAA: [[PTR1:%[a-zA-Z0-9]+]] = mul i64 %idxprom, 96
+; CHECK-NoAA: [[PTR2:%[a-zA-Z0-9]+]] = add i64 [[PTR0]], [[PTR1]]
+; CHECK-NoAA: add i64 [[PTR2]], 23052
+; CHECK-NoAA: inttoptr
+; CHECK-NoAA: if.then:
+; CHECK-NoAA-NOT: ptrtoint
+; CHECK-NoAA-NOT: mul
+; CHECK-NoAA: add i64 [[PTR2]], 23048
+; CHECK-NoAA: inttoptr
+
+; CHECK-UseAA-LABEL: @test_GEP_CSE(
+; CHECK-UseAA: [[PTR0:%[a-zA-Z0-9]+]] = bitcast [240 x %struct]* %string to i8*
+; CHECK-UseAA: [[IDX:%[a-zA-Z0-9]+]] = mul i64 %idxprom, 96
+; CHECK-UseAA: [[PTR1:%[a-zA-Z0-9]+]] = getelementptr i8* [[PTR0]], i64 [[IDX]]
+; CHECK-UseAA: getelementptr i8* [[PTR1]], i64 23052
+; CHECK-UseAA: bitcast
+; CHECK-UseAA: if.then:
+; CHECK-UseAA: getelementptr i8* [[PTR1]], i64 23048
+; CHECK-UseAA: bitcast
+
+%class.my = type { i32, [128 x i32], i32, [256 x %struct.pt]}
+%struct.pt = type { %struct.point*, i32, i32 }
+%struct.point = type { i32, i32 }
+
+; Check when a GEP is used across two basic block, LLVM can sink the address
+; calculation and code gen can generate a better addressing mode for the second
+; use.
+define void @test_GEP_across_BB(%class.my* %this, i64 %idx) {
+  %1 = getelementptr %class.my* %this, i64 0, i32 3, i64 %idx, i32 1
+  %2 = load i32* %1, align 4
+  %3 = getelementptr %class.my* %this, i64 0, i32 3, i64 %idx, i32 2
+  %4 = load i32* %3, align 4
+  %5 = icmp eq i32 %2, %4
+  br i1 %5, label %if.true, label %exit
+
+if.true:
+  %6 = shl i32 %4, 1
+  store i32 %6, i32* %3, align 4
+  br label %exit
+
+exit:
+  %7 = add nsw i32 %4, 1
+  store i32 %7, i32* %1, align 4
+  ret void
+}
+; CHECK-LABEL: test_GEP_across_BB:
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #528]
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #532]
+; CHECK-NOT: add
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #532]
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #528]
+
+; CHECK-NoAA-LABEL: test_GEP_across_BB(
+; CHECK-NoAA: add i64 [[TMP:%[a-zA-Z0-9]+]], 528
+; CHECK-NoAA: add i64 [[TMP]], 532
+; CHECK-NoAA: if.true:
+; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = add i64 [[TMP]], 532
+; CHECK-NoAA: exit:
+; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = add i64 [[TMP]], 528
+
+; CHECK-UseAA-LABEL: test_GEP_across_BB(
+; CHECK-UseAA: [[PTR0:%[a-zA-Z0-9]+]] = getelementptr
+; CHECK-UseAA: getelementptr i8* [[PTR0]], i64 528
+; CHECK-UseAA: getelementptr i8* [[PTR0]], i64 532
+; CHECK-UseAA: if.true:
+; CHECK-UseAA: {{%sunk[a-zA-Z0-9]+}} = getelementptr i8* [[PTR0]], i64 532
+; CHECK-UseAA: exit:
+; CHECK-UseAA: {{%sunk[a-zA-Z0-9]+}} = getelementptr i8* [[PTR0]], i64 528
+
+%struct.S = type { float, double }
+@struct_array = global [1024 x %struct.S] zeroinitializer, align 16
+
+; The following two test cases check we can extract constant from indices of
+; struct type.
+; The constant offsets are from indices "i64 %idxprom" and "i32 1". As the
+; alloca size of %struct.S is 16, and "i32 1" is the 2rd element whose field
+; offset is 8, the total constant offset is (5 * 16 + 8) = 88.
+define double* @test-struct_1(i32 %i) {
+entry:
+  %add = add nsw i32 %i, 5
+  %idxprom = sext i32 %add to i64
+  %p = getelementptr [1024 x %struct.S]* @struct_array, i64 0, i64 %idxprom, i32 1
+  ret double* %p
+}
+; CHECK-NoAA-LABEL: @test-struct_1(
+; CHECK-NoAA-NOT: getelementptr
+; CHECK-NoAA: add i64 %{{[a-zA-Z0-9]+}}, 88
+
+; CHECK-UseAA-LABEL: @test-struct_1(
+; CHECK-UseAA: getelementptr i8* %{{[a-zA-Z0-9]+}}, i64 88
+
+%struct3 = type { i64, i32 }
+%struct2 = type { %struct3, i32 }
+%struct1 = type { i64, %struct2 }
+%struct0 = type { i32, i32, i64*, [100 x %struct1] }
+
+; The constant offsets are from indices "i32 3", "i64 %arrayidx" and "i32 1".
+; "i32 3" is the 4th element whose field offset is 16. The alloca size of
+; %struct1 is 32. "i32 1" is the 2rd element whose field offset is 8. So the
+; total constant offset is 16 + (-2 * 32) + 8 = -40
+define %struct2* @test-struct_2(%struct0* %ptr, i64 %idx) {
+entry:
+  %arrayidx = add nsw i64 %idx, -2
+  %ptr2 = getelementptr %struct0* %ptr, i64 0, i32 3, i64 %arrayidx, i32 1
+  ret %struct2* %ptr2
+}
+; CHECK-NoAA-LABEL: @test-struct_2(
+; CHECK-NoAA-NOT: = getelementptr
+; CHECK-NoAA: add i64 %{{[a-zA-Z0-9]+}}, -40
+
+; CHECK-UseAA-LABEL: @test-struct_2(
+; CHECK-UseAA: getelementptr i8* %{{[a-zA-Z0-9]+}}, i64 -40
+
+; Test that when a index is added from two constant, SeparateConstOffsetFromGEP
+; pass does not generate incorrect result.
+define void @test_const_add([3 x i32]* %in) {
+  %inc = add nsw i32 2, 1
+  %idxprom = sext i32 %inc to i64
+  %arrayidx = getelementptr [3 x i32]* %in, i64 %idxprom, i64 2
+  store i32 0, i32* %arrayidx, align 4
+  ret void
+}
+; CHECK-LABEL: test_const_add:
+; CHECK: str wzr, [x0, #44]
diff --git a/test/CodeGen/AArch64/aarch64-smull.ll b/test/CodeGen/AArch64/aarch64-smull.ll
new file mode 100644
index 0000000..92582d7
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-smull.ll
@@ -0,0 +1,332 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o -| FileCheck %s
+
+define <8 x i16> @smull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+; CHECK-LABEL: smull_v8i8_v8i16:
+; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
+  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = mul <8 x i16> %tmp3, %tmp4
+  ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @smull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+; CHECK-LABEL: smull_v4i16_v4i32:
+; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
+  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = mul <4 x i32> %tmp3, %tmp4
+  ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @smull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+; CHECK-LABEL: smull_v2i32_v2i64:
+; CHECK:  smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i32>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
+  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = mul <2 x i64> %tmp3, %tmp4
+  ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @umull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+; CHECK-LABEL: umull_v8i8_v8i16:
+; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
+  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = mul <8 x i16> %tmp3, %tmp4
+  ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @umull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+; CHECK-LABEL: umull_v4i16_v4i32:
+; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
+  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = mul <4 x i32> %tmp3, %tmp4
+  ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @umull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+; CHECK-LABEL: umull_v2i32_v2i64:
+; CHECK:  umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i32>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
+  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = mul <2 x i64> %tmp3, %tmp4
+  ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @smlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: smlal_v8i8_v8i16:
+; CHECK:  smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = load <8 x i8>* %C
+  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
+  %tmp6 = mul <8 x i16> %tmp4, %tmp5
+  %tmp7 = add <8 x i16> %tmp1, %tmp6
+  ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @smlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: smlal_v4i16_v4i32:
+; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = load <4 x i16>* %C
+  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
+  %tmp6 = mul <4 x i32> %tmp4, %tmp5
+  %tmp7 = add <4 x i32> %tmp1, %tmp6
+  ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @smlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: smlal_v2i32_v2i64:
+; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = load <2 x i32>* %C
+  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
+  %tmp6 = mul <2 x i64> %tmp4, %tmp5
+  %tmp7 = add <2 x i64> %tmp1, %tmp6
+  ret <2 x i64> %tmp7
+}
+
+define <8 x i16> @umlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: umlal_v8i8_v8i16:
+; CHECK:  umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = load <8 x i8>* %C
+  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
+  %tmp6 = mul <8 x i16> %tmp4, %tmp5
+  %tmp7 = add <8 x i16> %tmp1, %tmp6
+  ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @umlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: umlal_v4i16_v4i32:
+; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = load <4 x i16>* %C
+  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
+  %tmp6 = mul <4 x i32> %tmp4, %tmp5
+  %tmp7 = add <4 x i32> %tmp1, %tmp6
+  ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @umlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: umlal_v2i32_v2i64:
+; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = load <2 x i32>* %C
+  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
+  %tmp6 = mul <2 x i64> %tmp4, %tmp5
+  %tmp7 = add <2 x i64> %tmp1, %tmp6
+  ret <2 x i64> %tmp7
+}
+
+define <8 x i16> @smlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: smlsl_v8i8_v8i16:
+; CHECK:  smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = load <8 x i8>* %C
+  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
+  %tmp6 = mul <8 x i16> %tmp4, %tmp5
+  %tmp7 = sub <8 x i16> %tmp1, %tmp6
+  ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @smlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: smlsl_v4i16_v4i32:
+; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = load <4 x i16>* %C
+  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
+  %tmp6 = mul <4 x i32> %tmp4, %tmp5
+  %tmp7 = sub <4 x i32> %tmp1, %tmp6
+  ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @smlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: smlsl_v2i32_v2i64:
+; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = load <2 x i32>* %C
+  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
+  %tmp6 = mul <2 x i64> %tmp4, %tmp5
+  %tmp7 = sub <2 x i64> %tmp1, %tmp6
+  ret <2 x i64> %tmp7
+}
+
+define <8 x i16> @umlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: umlsl_v8i8_v8i16:
+; CHECK:  umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = load <8 x i8>* %C
+  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
+  %tmp6 = mul <8 x i16> %tmp4, %tmp5
+  %tmp7 = sub <8 x i16> %tmp1, %tmp6
+  ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @umlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: umlsl_v4i16_v4i32:
+; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = load <4 x i16>* %C
+  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
+  %tmp6 = mul <4 x i32> %tmp4, %tmp5
+  %tmp7 = sub <4 x i32> %tmp1, %tmp6
+  ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @umlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: umlsl_v2i32_v2i64:
+; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = load <2 x i32>* %C
+  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
+  %tmp6 = mul <2 x i64> %tmp4, %tmp5
+  %tmp7 = sub <2 x i64> %tmp1, %tmp6
+  ret <2 x i64> %tmp7
+}
+
+; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
+define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; CHECK-LABEL: smull_extvec_v8i8_v8i16:
+; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp3 = sext <8 x i8> %arg to <8 x i16>
+  %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
+  ret <8 x i16> %tmp4
+}
+
+define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; Do not use SMULL if the BUILD_VECTOR element values are too big.
+; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
+; CHECK: movz
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+  %tmp3 = sext <8 x i8> %arg to <8 x i16>
+  %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
+  ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
+; CHECK-LABEL: smull_extvec_v4i16_v4i32:
+; CHECK:  smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp3 = sext <4 x i16> %arg to <4 x i32>
+  %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
+  ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
+; CHECK: smull_extvec_v2i32_v2i64
+; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp3 = sext <2 x i32> %arg to <2 x i64>
+  %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
+  ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; CHECK-LABEL: umull_extvec_v8i8_v8i16:
+; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp3 = zext <8 x i8> %arg to <8 x i16>
+  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
+  ret <8 x i16> %tmp4
+}
+
+define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; Do not use SMULL if the BUILD_VECTOR element values are too big.
+; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
+; CHECK: movz
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+  %tmp3 = zext <8 x i8> %arg to <8 x i16>
+  %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
+  ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
+; CHECK-LABEL: umull_extvec_v4i16_v4i32:
+; CHECK:  umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp3 = zext <4 x i16> %arg to <4 x i32>
+  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
+  ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
+; CHECK-LABEL: umull_extvec_v2i32_v2i64:
+; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp3 = zext <2 x i32> %arg to <2 x i64>
+  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
+  ret <2 x i64> %tmp4
+}
+
+define i16 @smullWithInconsistentExtensions(<8 x i8> %vec) {
+; If one operand has a zero-extend and the other a sign-extend, smull
+; cannot be used.
+; CHECK-LABEL: smullWithInconsistentExtensions:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+  %1 = sext <8 x i8> %vec to <8 x i16>
+  %2 = mul <8 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %3 = extractelement <8 x i16> %2, i32 0
+  ret i16 %3
+}
+
+define void @distribute(i16* %dst, i8* %src, i32 %mul) nounwind {
+entry:
+; CHECK-LABEL: distribute:
+; CHECK: umull [[REG1:(v[0-9]+.8h)]], {{v[0-9]+}}.8b, [[REG2:(v[0-9]+.8b)]]
+; CHECK: umlal [[REG1]], {{v[0-9]+}}.8b, [[REG2]]
+  %0 = trunc i32 %mul to i8
+  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %3 = tail call <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8* %src, i32 1)
+  %4 = bitcast <16 x i8> %3 to <2 x double>
+  %5 = extractelement <2 x double> %4, i32 1
+  %6 = bitcast double %5 to <8 x i8>
+  %7 = zext <8 x i8> %6 to <8 x i16>
+  %8 = zext <8 x i8> %2 to <8 x i16>
+  %9 = extractelement <2 x double> %4, i32 0
+  %10 = bitcast double %9 to <8 x i8>
+  %11 = zext <8 x i8> %10 to <8 x i16>
+  %12 = add <8 x i16> %7, %11
+  %13 = mul <8 x i16> %12, %8
+  %14 = bitcast i16* %dst to i8*
+  tail call void @llvm.aarch64.neon.vst1.v8i16(i8* %14, <8 x i16> %13, i32 2)
+  ret void
+}
+
+declare <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8*, i32) nounwind readonly
+
+declare void @llvm.aarch64.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
+
diff --git a/test/CodeGen/AArch64/aarch64-wide-shuffle.ll b/test/CodeGen/AArch64/aarch64-wide-shuffle.ll
new file mode 100644
index 0000000..d06df7a
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-wide-shuffle.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define <4 x i16> @f(<4 x i32> %vqdmlal_v3.i, <8 x i16> %x5) {
+entry:
+  ; Check that we don't just dup the input vector. The code emitted is ext, dup, ext, ext
+  ; but only match the last three instructions as the first two could be combined to
+  ; a dup2 at some stage.
+  ; CHECK: dup
+  ; CHECK: ext
+  ; CHECK: ext
+  %x4 = extractelement <4 x i32> %vqdmlal_v3.i, i32 2
+  %vgetq_lane = trunc i32 %x4 to i16
+  %vecinit.i = insertelement <4 x i16> undef, i16 %vgetq_lane, i32 0
+  %vecinit2.i = insertelement <4 x i16> %vecinit.i, i16 %vgetq_lane, i32 2
+  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vgetq_lane, i32 3
+  %vgetq_lane261 = extractelement <8 x i16> %x5, i32 0
+  %vset_lane267 = insertelement <4 x i16> %vecinit3.i, i16 %vgetq_lane261, i32 1
+  ret <4 x i16> %vset_lane267
+}
diff --git a/test/CodeGen/AArch64/aarch64_f16_be.ll b/test/CodeGen/AArch64/aarch64_f16_be.ll
new file mode 100644
index 0000000..7504439
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64_f16_be.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mtriple=aarch64-linux-gnuabi -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-linux-gnuabi -O0 < %s | FileCheck %s --check-prefix=CHECK-BE
+
+define void @test_bitcast_v8f16_to_v4f32(<8 x half> %a) {
+; CHECK-LABEL: test_bitcast_v8f16_to_v4f32:
+; CHECK-NOT: st1
+
+; CHECK-BE-LABEL: test_bitcast_v8f16_to_v4f32:
+; CHECK-BE: st1
+
+  %x = alloca <4 x float>, align 16
+  %y = bitcast <8 x half> %a to <4 x float>
+  store <4 x float> %y, <4 x float>* %x, align 16
+  ret void
+}
+
+define void @test_bitcast_v8f16_to_v2f64(<8 x half> %a) {
+; CHECK-LABEL: test_bitcast_v8f16_to_v2f64:
+; CHECK-NOT: st1
+
+; CHECK-BE-LABEL: test_bitcast_v8f16_to_v2f64:
+; CHECK-BE: st1
+
+  %x = alloca <2 x double>, align 16
+  %y = bitcast <8 x half> %a to <2 x double>
+  store <2 x double> %y, <2 x double>* %x, align 16
+  ret void
+}
+
+define void @test_bitcast_v8f16_to_fp128(<8 x half> %a) {
+; CHECK-LABEL: test_bitcast_v8f16_to_fp128:
+; CHECK-NOT: st1
+
+; CHECK-BE-LABEL: test_bitcast_v8f16_to_fp128:
+; CHECK-BE: st1
+
+  %x = alloca fp128, align 16
+  %y = bitcast <8 x half> %a to fp128
+  store fp128 %y, fp128* %x, align 16
+  ret void
+}
+
+define void @test_bitcast_v4f16_to_v2f32(<4 x half> %a) {
+; CHECK-LABEL: test_bitcast_v4f16_to_v2f32:
+; CHECK-NOT: st1
+
+; CHECK-BE-LABEL: test_bitcast_v4f16_to_v2f32:
+; CHECK-BE: st1
+
+  %x = alloca <2 x float>, align 8
+  %y = bitcast <4 x half> %a to <2 x float>
+  store <2 x float> %y, <2 x float>* %x, align 8
+  ret void
+}
+
+define void @test_bitcast_v4f16_to_v1f64(<4 x half> %a) {
+; CHECK-LABEL: test_bitcast_v4f16_to_v1f64:
+; CHECK-NOT: st1
+
+; CHECK-BE-LABEL: test_bitcast_v4f16_to_v1f64:
+; CHECK-BE: st1
+
+  %x = alloca <1 x double>, align 8
+  %y = bitcast <4 x half> %a to <1 x double>
+  store <1 x double> %y, <1 x double>* %x, align 8
+  ret void
+}
diff --git a/test/CodeGen/AArch64/aarch64_tree_tests.ll b/test/CodeGen/AArch64/aarch64_tree_tests.ll
new file mode 100644
index 0000000..08e506a
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64_tree_tests.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s | FileCheck %s 
+
+; ModuleID = 'aarch64_tree_tests.bc'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "arm64--linux-gnu"
+
+; CHECK-LABLE: @aarch64_tree_tests_and
+; CHECK: .hword	32768                   
+; CHECK: .hword	32767                   
+; CHECK: .hword	4664                    
+; CHECK: .hword	32767                   
+; CHECK: .hword	32768                   
+; CHECK: .hword	32768                   
+; CHECK: .hword	0                       
+; CHECK: .hword	0                      
+
+; Function Attrs: nounwind readnone
+define <8 x i16> @aarch64_tree_tests_and(<8 x i16> %a) {
+entry:
+  %and = and <8 x i16> <i16 0, i16 undef, i16 undef, i16 0, i16 0, i16 undef, i16 undef, i16 0>, %a
+  %ret = add <8 x i16> %and, <i16 -32768, i16 32767, i16 4664, i16 32767, i16 -32768, i16 -32768, i16 0, i16 0>
+  ret <8 x i16> %ret
+}
+
+; CHECK-LABLE: @aarch64_tree_tests_or
+; CHECK: .hword	32768                 
+; CHECK: .hword	32766
+; CHECK: .hword	4664     
+; CHECK: .hword	32766                
+; CHECK: .hword	32768 
+; CHECK: .hword	32768
+; CHECK: .hword	65535            
+; CHECK: .hword	65535
+
+; Function Attrs: nounwind readnone
+define <8 x i16> @aarch64_tree_tests_or(<8 x i16> %a) {
+entry:
+  %or = or <8 x i16> <i16 -1, i16 undef, i16 undef, i16 -1, i16 -1, i16 undef, i16 undef, i16 -1>, %a
+  %ret = add <8 x i16> %or, <i16 -32767, i16 32767, i16 4665, i16 32767, i16 -32767, i16 -32767, i16 0, i16 0>
+  ret <8 x i16> %ret
+}
+
diff --git a/test/CodeGen/AArch64/adc.ll b/test/CodeGen/AArch64/adc.ll
index 892573b..0488ee2 100644
--- a/test/CodeGen/AArch64/adc.ll
+++ b/test/CodeGen/AArch64/adc.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-apple-ios7.0 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
 
 define i128 @test_simple(i128 %a, i128 %b, i128 %c) {
 ; CHECK-LABEL: test_simple:
diff --git a/test/CodeGen/AArch64/analyzecmp.ll b/test/CodeGen/AArch64/analyzecmp.ll
new file mode 100644
index 0000000..8962505
--- /dev/null
+++ b/test/CodeGen/AArch64/analyzecmp.ll
@@ -0,0 +1,32 @@
+; RUN: llc -O3 -mcpu=cortex-a57 < %s | FileCheck %s 
+
+; CHECK-LABLE: @test
+; CHECK: tst [[CMP:x[0-9]+]], #0x8000000000000000
+; CHECK: csel [[R0:x[0-9]+]], [[S0:x[0-9]+]], [[S1:x[0-9]+]], eq
+; CHECK: csel [[R1:x[0-9]+]], [[S2:x[0-9]+]], [[S3:x[0-9]+]], eq
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "arm64--linux-gnueabi"
+
+define void @test(i64 %a, i64* %ptr1, i64* %ptr2) #0 align 2 {
+entry:
+  %conv = and i64 %a, 4294967295
+  %add = add nsw i64 %conv, -1
+  %div = sdiv i64 %add, 64
+  %rem = srem i64 %add, 64
+  %cmp = icmp slt i64 %rem, 0
+  br i1 %cmp, label %if.then, label %exit
+
+if.then:                                
+  %add2 = add nsw i64 %rem, 64
+  %add3 = add i64 %div, -1
+  br label %exit
+
+exit:                 
+  %__n = phi i64 [ %add3, %if.then ], [ %div, %entry ]
+  %__n.0 = phi i64 [ %add2, %if.then ], [ %rem, %entry ]
+  store i64 %__n, i64* %ptr1
+  store i64 %__n.0, i64* %ptr2
+  ret void 
+}
+
+
diff --git a/test/CodeGen/AArch64/and-mask-removal.ll b/test/CodeGen/AArch64/and-mask-removal.ll
new file mode 100644
index 0000000..f803b85
--- /dev/null
+++ b/test/CodeGen/AArch64/and-mask-removal.ll
@@ -0,0 +1,269 @@
+; RUN: llc -O0 -fast-isel=false -mtriple=arm64-apple-darwin  < %s  | FileCheck %s
+
+@board = common global [400 x i8] zeroinitializer, align 1
+@next_string = common global i32 0, align 4
+@string_number = common global [400 x i32] zeroinitializer, align 4
+
+; Function Attrs: nounwind ssp
+define void @new_position(i32 %pos) {
+entry:
+  %idxprom = sext i32 %pos to i64
+  %arrayidx = getelementptr inbounds [400 x i8]* @board, i64 0, i64 %idxprom
+  %tmp = load i8* %arrayidx, align 1
+  %.off = add i8 %tmp, -1
+  %switch = icmp ult i8 %.off, 2
+  br i1 %switch, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %tmp1 = load i32* @next_string, align 4
+  %arrayidx8 = getelementptr inbounds [400 x i32]* @string_number, i64 0, i64 %idxprom
+  store i32 %tmp1, i32* %arrayidx8, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+; CHECK-LABEL: new_position
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_0(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, 74
+  %1 = icmp ult i8 %0, -20
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_0
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_1(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, 246
+  %1 = icmp uge i8 %0, 90
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_1
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_2(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, 227
+  %1 = icmp ne i8 %0, 179
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_2
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_3(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, 201
+  %1 = icmp eq i8 %0, 154
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_3
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_4(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, -79
+  %1 = icmp ne i8 %0, -40
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_4
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_5(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, 133
+  %1 = icmp uge i8 %0, -105
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_5
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_6(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, -58
+  %1 = icmp uge i8 %0, 155
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_6
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_7(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, 225
+  %1 = icmp ult i8 %0, 124
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_7
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+
+
+define zeroext i1 @test8_8(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, 190
+  %1 = icmp uge i8 %0, 1
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_8
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_0(i16 zeroext %x)  align 2 {
+entry:
+  %0 = add i16 %x, -46989
+  %1 = icmp ne i16 %0, -41903
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test16_0
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_2(i16 zeroext %x)  align 2 {
+entry:
+  %0 = add i16 %x, 16882
+  %1 = icmp ule i16 %0, -24837
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test16_2
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_3(i16 zeroext %x)  align 2 {
+entry:
+  %0 = add i16 %x, 29283
+  %1 = icmp ne i16 %0, 16947
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test16_3
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_4(i16 zeroext %x)  align 2 {
+entry:
+  %0 = add i16 %x, -35551
+  %1 = icmp uge i16 %0, 15677
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test16_4
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_5(i16 zeroext %x)  align 2 {
+entry:
+  %0 = add i16 %x, -25214
+  %1 = icmp ne i16 %0, -1932
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test16_5
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_6(i16 zeroext %x)  align 2 {
+entry:
+  %0 = add i16 %x, -32194
+  %1 = icmp uge i16 %0, -41215
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test16_6
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_7(i16 zeroext %x)  align 2 {
+entry:
+  %0 = add i16 %x, 9272
+  %1 = icmp uge i16 %0, -42916
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test16_7
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_8(i16 zeroext %x)  align 2 {
+entry:
+  %0 = add i16 %x, -63749
+  %1 = icmp ne i16 %0, 6706
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test16_8
+; CHECK-NOT: and
+; CHECK: ret
+}
+
diff --git a/test/CodeGen/AArch64/andandshift.ll b/test/CodeGen/AArch64/andandshift.ll
new file mode 100644
index 0000000..e2c7a09
--- /dev/null
+++ b/test/CodeGen/AArch64/andandshift.ll
@@ -0,0 +1,28 @@
+; RUN: llc -O3 < %s | FileCheck %s 
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "arm64--linux-gnu"
+
+; Function Attrs: nounwind readnone
+define i32 @test1(i8 %a) {
+; CHECK-LABLE: @test1
+; CHECK: ubfx {{w[0-9]+}}, w0, #3, #5
+entry:
+  %conv = zext i8 %a to i32
+  %shr1 = lshr i32 %conv, 3
+  ret i32 %shr1
+}
+
+; Function Attrs: nounwind readnone
+define i32 @test2(i8 %a) {
+; CHECK-LABLE: @test2
+; CHECK: and {{w[0-9]+}}, w0, #0xff
+; CHECK: ubfx {{w[0-9]+}}, w0, #3, #5
+entry:
+  %conv = zext i8 %a to i32
+  %cmp = icmp ugt i8 %a, 47
+  %shr5 = lshr i32 %conv, 3
+  %retval.0 = select i1 %cmp, i32 %shr5, i32 %conv
+  ret i32 %retval.0
+}
+
+
diff --git a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
index 2b083d8..e57a8c9 100644
--- a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
+++ b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
@@ -11,34 +11,34 @@ if.then24:                                        ; preds = %entry
   unreachable
 
 if.else295:                                       ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{i32* %do_tab_convert}, metadata !16), !dbg !18
+  call void @llvm.dbg.declare(metadata !{i32* %do_tab_convert}, metadata !16, metadata !{metadata !"0x102"}), !dbg !18
   store i32 0, i32* %do_tab_convert, align 4, !dbg !19
   unreachable
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.gv = !{!0}
 !llvm.dbg.sp = !{!1, !7, !10, !11, !12}
 
-!0 = metadata !{i32 589876, i32 0, metadata !1, metadata !"vsplive", metadata !"vsplive", metadata !"", metadata !2, i32 617, metadata !6, i32 1, i32 1, null, null} ; [ DW_TAG_variable ]
-!1 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"drt_vsprintf", metadata !"drt_vsprintf", metadata !"", i32 616, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 589865, metadata !20} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 589841, metadata !20, i32 12, metadata !"clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 589845, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !5, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!0 = metadata !{metadata !"0x34\00vsplive\00vsplive\00\00617\001\001", metadata !1, metadata !2, metadata !6, null, null} ; [ DW_TAG_variable ]
+!1 = metadata !{metadata !"0x2e\00drt_vsprintf\00drt_vsprintf\00\00616\000\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\0012\00clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)\001\00\000\00\000", metadata !20, metadata !21, metadata !21, null, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !5, i32 0} ; [ DW_TAG_subroutine_type ]
 !5 = metadata !{metadata !6}
-!6 = metadata !{i32 589860, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"putc_mem", metadata !"putc_mem", metadata !"", i32 30, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!8 = metadata !{i32 589845, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !9, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !3} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0x2e\00putc_mem\00putc_mem\00\0030\001\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !8, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!8 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !9, i32 0} ; [ DW_TAG_subroutine_type ]
 !9 = metadata !{null}
-!10 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_double", metadata !"print_double", metadata !"", i32 203, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_number", metadata !"print_number", metadata !"", i32 75, metadata !4, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!12 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"get_flags", metadata !"get_flags", metadata !"", i32 508, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x2e\00print_double\00print_double\00\00203\001\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = metadata !{metadata !"0x2e\00print_number\00print_number\00\0075\001\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !4, i32 0, null, null, null, null} ; [ DW_TAG_subprogram ]
+!12 = metadata !{metadata !"0x2e\00get_flags\00get_flags\00\00508\001\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !8, null, null, null, null, null} ; [ DW_TAG_subprogram ]
 !13 = metadata !{i32 653, i32 5, metadata !14, null}
-!14 = metadata !{i32 589835, metadata !20, metadata !15, i32 652, i32 35, i32 2} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{i32 589835, metadata !20, metadata !1, i32 616, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 590080, metadata !17, metadata !"do_tab_convert", metadata !2, i32 853, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!17 = metadata !{i32 589835, metadata !20, metadata !14, i32 850, i32 12, i32 33} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0xb\00652\0035\002", metadata !20, metadata !15} ; [ DW_TAG_lexical_block ]
+!15 = metadata !{metadata !"0xb\00616\001\000", metadata !20, metadata !1} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{metadata !"0x100\00do_tab_convert\00853\000", metadata !17, metadata !2, metadata !6} ; [ DW_TAG_auto_variable ]
+!17 = metadata !{metadata !"0xb\00850\0012\0033", metadata !20, metadata !14} ; [ DW_TAG_lexical_block ]
 !18 = metadata !{i32 853, i32 11, metadata !17, null}
 !19 = metadata !{i32 853, i32 29, metadata !17, null}
 !20 = metadata !{metadata !"print.i", metadata !"/Volumes/Ebi/echeng/radars/r9146594"}
diff --git a/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
index 8f99bc3..a83f164 100644
--- a/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
+++ b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
@@ -12,7 +12,7 @@ entry:
 
 for.body:
 ; CHECK: for.body
-; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}]
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}]
 ; CHECK: add x[[REG:[0-9]+]],
 ; CHECK:                      x[[REG]], #1, lsl  #12
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
diff --git a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
index 168e921..7d880f3 100644
--- a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
+++ b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=arm64 -O0 < %s | FileCheck %s
-; RUN: llc -march=arm64 -O3 < %s | FileCheck %s
+; RUN: llc -march=arm64 -O0 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=arm64 -O3 -verify-machineinstrs < %s | FileCheck %s
 
 @.str = private unnamed_addr constant [9 x i8] c"%lf %lu\0A\00", align 1
 @.str1 = private unnamed_addr constant [8 x i8] c"%lf %u\0A\00", align 1
diff --git a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
index c4597d5..6266d1c 100644
--- a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
+++ b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -1,15 +1,36 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s -check-prefix=GENERIC
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=true | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-NOOPT
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=false | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-OPT
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=true | FileCheck %s -check-prefix=GENERIC -check-prefix=GENERIC-NOOPT
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=false | FileCheck %s -check-prefix=GENERIC -check-prefix=GENERIC-OPT
 
 define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
 ; CHECK-LABEL: bar:
 ; CHECK: add.2d	v[[REG:[0-9]+]], v0, v1
 ; CHECK: add	d[[REG3:[0-9]+]], d[[REG]], d1
+; Without advanced copy optimization, we end up with cross register
+; banks copies that cannot be coalesced.
+; CHECK-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
+; With advanced copy optimization, we end up with just one copy
+; to insert the computed high part into the V register. 
+; CHECK-OPT-NOT: fmov
 ; CHECK: sub	d[[REG2:[0-9]+]], d[[REG]], d1
+; CHECK: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
+; CHECK-NOOPT: fmov d0, [[COPY_REG3]]
+; CHECK-OPT-NOT: fmov
+; CHECK: ins.d v0[1], [[COPY_REG2]]
+; CHECK-NEXT: ret
+;
 ; GENERIC-LABEL: bar:
 ; GENERIC: add	v[[REG:[0-9]+]].2d, v0.2d, v1.2d
 ; GENERIC: add	d[[REG3:[0-9]+]], d[[REG]], d1
+; GENERIC-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
+; GENERIC-OPT-NOT: fmov
 ; GENERIC: sub	d[[REG2:[0-9]+]], d[[REG]], d1
+; GENERIC: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
+; GENERIC-NOOPT: fmov d0, [[COPY_REG3]]
+; GENERIC-OPT-NOT: fmov
+; GENERIC: ins v0.d[1], [[COPY_REG2]]
+; GENERIC-NEXT: ret
   %add = add <2 x i64> %a, %b
   %vgetq_lane = extractelement <2 x i64> %add, i32 0
   %vgetq_lane2 = extractelement <2 x i64> %b, i32 0
@@ -65,3 +86,44 @@ define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
   %retval = bitcast i64 %sub.i to double
   ret double %retval
 }
+define double @and_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: and_su64:
+; CHECK: and.8b v0, v1, v0
+; CHECK-NEXT: ret
+; GENERIC-LABEL: and_su64:
+; GENERIC: and v0.8b, v1.8b, v0.8b
+; GENERIC-NEXT: ret
+  %vecext = extractelement <2 x i64> %a, i32 0
+  %vecext1 = extractelement <2 x i64> %b, i32 0
+  %or.i = and i64 %vecext1, %vecext
+  %retval = bitcast i64 %or.i to double
+  ret double %retval
+}
+
+define double @orr_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: orr_su64:
+; CHECK: orr.8b v0, v1, v0
+; CHECK-NEXT: ret
+; GENERIC-LABEL: orr_su64:
+; GENERIC: orr v0.8b, v1.8b, v0.8b
+; GENERIC-NEXT: ret
+  %vecext = extractelement <2 x i64> %a, i32 0
+  %vecext1 = extractelement <2 x i64> %b, i32 0
+  %or.i = or i64 %vecext1, %vecext
+  %retval = bitcast i64 %or.i to double
+  ret double %retval
+}
+
+define double @xorr_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: xorr_su64:
+; CHECK: eor.8b v0, v1, v0
+; CHECK-NEXT: ret
+; GENERIC-LABEL: xorr_su64:
+; GENERIC: eor v0.8b, v1.8b, v0.8b
+; GENERIC-NEXT: ret
+  %vecext = extractelement <2 x i64> %a, i32 0
+  %vecext1 = extractelement <2 x i64> %b, i32 0
+  %xor.i = xor i64 %vecext1, %vecext
+  %retval = bitcast i64 %xor.i to double
+  ret double %retval
+}
diff --git a/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll b/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll
index 1b2d543..1bb47fc 100644
--- a/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll
+++ b/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc -O0 -march=arm64 -aarch64-neon-syntax=apple -verify-machineinstrs < %s | FileCheck %s
 
 ; The following 2 test cases test shufflevector with beginning UNDEF mask.
 define <8 x i16> @test_vext_undef_traverse(<8 x i16> %in) {
diff --git a/test/CodeGen/AArch64/arm64-aapcs-be.ll b/test/CodeGen/AArch64/arm64-aapcs-be.ll
new file mode 100644
index 0000000..77e2b0f
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-aapcs-be.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=aarch64_be-none-eabi -fast-isel=false < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-none-eabi -fast-isel=true < %s | FileCheck %s
+
+; Check narrow argument passing via stack - callee end
+define i32 @test_narrow_args_callee(i64 %x0, i64 %x1, i64 %x2, i64 %x3, i64 %x4, i64 %x5, i64 %x6, i64 %x7, i8 %c, i16 %s) #0 {
+entry:
+  %conv = zext i8 %c to i32
+  %conv1 = sext i16 %s to i32
+  %add = add nsw i32 %conv1, %conv
+; CHECK-LABEL: test_narrow_args_callee:
+; CHECK-DAG: ldrb w{{[0-9]}}, [sp, #7]
+; CHECK-DAG: ldr{{s?}}h w{{[0-9]}}, [sp, #14]
+  ret i32 %add
+}
+
+; Check narrow argument passing via stack - caller end
+define i32 @test_narrow_args_caller() #0 {
+entry:
+  %call = tail call i32 @test_narrow_args_callee(i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i8 8, i16 9)
+; CHECK-LABEL: test_narrow_args_caller:
+; CHECK-DAG: strh w{{[0-9]}}, [sp, #14]
+; CHECK-DAG: strb w{{[0-9]}}, [sp, #7]
+  ret i32 %call
+}
+\ No newline at end of file
diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll
index ccf1371..41c3ad5 100644
--- a/test/CodeGen/AArch64/arm64-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-aapcs.ll
@@ -109,3 +109,45 @@ entry:
 ; CHECK: ldr {{q[0-9]+}}, [sp]
   ret <2 x double> %varg_stack;
 }
+
+; Check that f16 can be passed and returned (ACLE 2.0 extension)
+define half @test_half(float, half %arg) {
+; CHECK-LABEL: test_half:
+; CHECK: mov v0.16b, v1.16b
+  ret half %arg;
+}
+
+; Check that f16 constants are materialized correctly
+define half @test_half_const() {
+; CHECK-LABEL: test_half_const:
+; CHECK: ldr h0, [x{{[0-9]+}}, :lo12:{{.*}}]
+  ret half 0xH4248
+}
+
+; Check that v4f16 can be passed and returned in registers
+define <4 x half> @test_v4_half_register(float, <4 x half> %arg) {
+; CHECK-LABEL: test_v4_half_register:
+; CHECK: mov v0.16b, v1.16b
+  ret <4 x half> %arg;
+}
+
+; Check that v8f16 can be passed and returned in registers
+define <8 x half> @test_v8_half_register(float, <8 x half> %arg) {
+; CHECK-LABEL: test_v8_half_register:
+; CHECK: mov v0.16b, v1.16b
+  ret <8 x half> %arg;
+}
+
+; Check that v4f16 can be passed and returned on the stack
+define <4 x half> @test_v4_half_stack([8 x <2 x double>], <4 x half> %arg) {
+; CHECK-LABEL: test_v4_half_stack:
+; CHECK: ldr d0, [sp]
+  ret <4 x half> %arg;
+}
+
+; Check that v8f16 can be passed and returned on the stack
+define <8 x half> @test_v8_half_stack([8 x <2 x double>], <8 x half> %arg) {
+; CHECK-LABEL: test_v8_half_stack:
+; CHECK: ldr q0, [sp]
+  ret <8 x half> %arg;
+}
diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll
index a955029..8a6b64d 100644
--- a/test/CodeGen/AArch64/arm64-abi.ll
+++ b/test/CodeGen/AArch64/arm64-abi.ll
@@ -1,7 +1,5 @@
-; RUN: llc < %s -debug -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
-; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s
-; REQUIRES: asserts
-target triple = "arm64-apple-darwin"
+; RUN: llc     -mtriple=arm64-apple-darwin -mcpu=cyclone -enable-misched=false < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=arm64-apple-darwin                                     < %s | FileCheck --check-prefix=FAST %s
 
 ; rdar://9932559
 define i64 @i8i16callee(i64 %a1, i64 %a2, i64 %a3, i8 signext %a4, i16 signext %a5, i64 %a6, i64 %a7, i64 %a8, i8 signext %b1, i16 signext %b2, i8 signext %b3, i8 signext %b4) nounwind readnone noinline {
@@ -42,7 +40,7 @@ entry:
 
 define i32 @i8i16caller() nounwind readnone {
 entry:
-; CHECK: i8i16caller
+; CHECK-LABEL: i8i16caller
 ; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
 ; They are i8, i16, i8 and i8.
 ; CHECK-DAG: strb {{w[0-9]+}}, [sp, #5]
@@ -50,7 +48,7 @@ entry:
 ; CHECK-DAG: strh {{w[0-9]+}}, [sp, #2]
 ; CHECK-DAG: strb {{w[0-9]+}}, [sp]
 ; CHECK: bl
-; FAST: i8i16caller
+; FAST-LABEL: i8i16caller
 ; FAST: strb {{w[0-9]+}}, [sp]
 ; FAST: strh {{w[0-9]+}}, [sp, #2]
 ; FAST: strb {{w[0-9]+}}, [sp, #4]
@@ -64,7 +62,7 @@ entry:
 ; rdar://12651543
 define double @circle_center([2 x float] %a) nounwind ssp {
   %call = tail call double @ext([2 x float] %a) nounwind
-; CHECK: circle_center
+; CHECK-LABEL: circle_center
 ; CHECK: bl
   ret double %call
 }
@@ -75,10 +73,10 @@ declare double @ext([2 x float])
 ; A double argument will be passed on stack, so vecotr should be at sp+16.
 define double @fixed_4i(<4 x i32>* nocapture %in) nounwind {
 entry:
-; CHECK: fixed_4i
+; CHECK-LABEL: fixed_4i
 ; CHECK: str [[REG_1:q[0-9]+]], [sp, #16]
-; FAST: fixed_4i
-; FAST: sub sp, sp, #64
+; FAST-LABEL: fixed_4i
+; FAST: sub sp, sp
 ; FAST: mov x[[ADDR:[0-9]+]], sp
 ; FAST: str [[REG_1:q[0-9]+]], [x[[ADDR]], #16]
   %0 = load <4 x i32>* %in, align 16
@@ -93,7 +91,7 @@ declare double @args_vec_4i(double, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>,
 define void @test1(float %f1, double %d1, double %d2, double %d3, double %d4,
        double %d5, double %d6, double %d7, double %d8, i32 %i) nounwind ssp {
 entry:
-; CHECK: test1
+; CHECK-LABEL: test1
 ; CHECK: ldr [[REG_1:d[0-9]+]], [sp]
 ; CHECK: scvtf [[REG_2:s[0-9]+]], w0
 ; CHECK: fadd s0, [[REG_2]], s0
@@ -110,7 +108,7 @@ entry:
 define void @test2(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
             i32 %i7, i32 %i8, i32 %i9, float %d1) nounwind ssp {
 entry:
-; CHECK: test2
+; CHECK-LABEL: test2
 ; CHECK: scvtf [[REG_2:s[0-9]+]], w0
 ; CHECK: fadd s0, [[REG_2]], s0
 ; CHECK: ldr [[REG_1:s[0-9]+]], [sp]
@@ -129,9 +127,9 @@ entry:
 ; Check alignment on stack for v64, f64, i64, f32, i32.
 define double @test3(<2 x i32>* nocapture %in) nounwind {
 entry:
-; CHECK: test3
+; CHECK-LABEL: test3
 ; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
-; FAST: test3
+; FAST-LABEL: test3
 ; FAST: sub sp, sp, #32
 ; FAST: mov x[[ADDR:[0-9]+]], sp
 ; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8]
@@ -146,7 +144,7 @@ declare double @args_vec_2i(double, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>,
 
 define double @test4(double* nocapture %in) nounwind {
 entry:
-; CHECK: test4
+; CHECK-LABEL: test4
 ; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
 ; CHECK: str [[REG_2:w[0-9]+]], [sp]
 ; CHECK: orr w0, wzr, #0x3
@@ -161,7 +159,7 @@ declare double @args_f64(double, double, double, double, double, double, double,
 
 define i64 @test5(i64* nocapture %in) nounwind {
 entry:
-; CHECK: test5
+; CHECK-LABEL: test5
 ; CHECK: strb [[REG_3:w[0-9]+]], [sp, #16]
 ; CHECK: str [[REG_1:x[0-9]+]], [sp, #8]
 ; CHECK: str [[REG_2:w[0-9]+]], [sp]
@@ -175,7 +173,7 @@ declare i64 @args_i64(i64, i64, i64, i64, i64, i64, i64, i64, i32, i64,
 
 define i32 @test6(float* nocapture %in) nounwind {
 entry:
-; CHECK: test6
+; CHECK-LABEL: test6
 ; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8]
 ; CHECK: str [[REG_1:s[0-9]+]], [sp, #4]
 ; CHECK: strh [[REG_3:w[0-9]+]], [sp]
@@ -192,7 +190,7 @@ declare i32 @args_f32(i32, i32, i32, i32, i32, i32, i32, i32,
 
 define i32 @test7(i32* nocapture %in) nounwind {
 entry:
-; CHECK: test7
+; CHECK-LABEL: test7
 ; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8]
 ; CHECK: str [[REG_1:w[0-9]+]], [sp, #4]
 ; CHECK: strh [[REG_3:w[0-9]+]], [sp]
@@ -206,13 +204,13 @@ declare i32 @args_i32(i32, i32, i32, i32, i32, i32, i32, i32, i16 signext, i32,
 
 define i32 @test8(i32 %argc, i8** nocapture %argv) nounwind {
 entry:
-; CHECK: test8
+; CHECK-LABEL: test8
 ; CHECK: strb {{w[0-9]+}}, [sp, #3]
 ; CHECK: strb wzr, [sp, #2]
 ; CHECK: strb {{w[0-9]+}}, [sp, #1]
 ; CHECK: strb wzr, [sp]
 ; CHECK: bl
-; FAST: test8
+; FAST-LABEL: test8
 ; FAST: strb {{w[0-9]+}}, [sp]
 ; FAST: strb {{w[0-9]+}}, [sp, #1]
 ; FAST: strb {{w[0-9]+}}, [sp, #2]
diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll
index 44c5a07..deb740e 100644
--- a/test/CodeGen/AArch64/arm64-abi_align.ll
+++ b/test/CodeGen/AArch64/arm64-abi_align.ll
@@ -34,7 +34,7 @@ target triple = "arm64-apple-darwin"
 ; structs with size < 8 bytes, passed via i64 in x1 and x2
 define i32 @f38(i32 %i, i64 %s1.coerce, i64 %s2.coerce) #0 {
 entry:
-; CHECK: f38
+; CHECK-LABEL: f38
 ; CHECK: add w[[A:[0-9]+]], w1, w0
 ; CHECK: add {{w[0-9]+}}, w[[A]], w2
   %s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce to i32
@@ -56,7 +56,7 @@ entry:
 
 define i32 @caller38() #1 {
 entry:
-; CHECK: caller38
+; CHECK-LABEL: caller38
 ; CHECK: ldr x1,
 ; CHECK: ldr x2,
   %0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4
@@ -72,7 +72,7 @@ declare i32 @f38_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
 ; i9 at [sp]
 define i32 @caller38_stack() #1 {
 entry:
-; CHECK: caller38_stack
+; CHECK-LABEL: caller38_stack
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
 ; CHECK: movz w[[C:[0-9]+]], #0x9
 ; CHECK: str w[[C]], [sp]
@@ -87,7 +87,7 @@ entry:
 ; passed via i128 in x1 and x3
 define i32 @f39(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 {
 entry:
-; CHECK: f39
+; CHECK-LABEL: f39
 ; CHECK: add w[[A:[0-9]+]], w1, w0
 ; CHECK: add {{w[0-9]+}}, w[[A]], w3
   %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32
@@ -109,7 +109,7 @@ entry:
 
 define i32 @caller39() #1 {
 entry:
-; CHECK: caller39
+; CHECK-LABEL: caller39
 ; CHECK: ldp x1, x2,
 ; CHECK: ldp x3, x4,
   %0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16
@@ -125,7 +125,7 @@ declare i32 @f39_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
 ; passed on stack at [sp+16] and [sp+32]
 define i32 @caller39_stack() #1 {
 entry:
-; CHECK: caller39_stack
+; CHECK-LABEL: caller39_stack
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
 ; CHECK: movz w[[C:[0-9]+]], #0x9
@@ -141,7 +141,7 @@ entry:
 ; passed via i128 in x1 and x3
 define i32 @f40(i32 %i, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0 {
 entry:
-; CHECK: f40
+; CHECK-LABEL: f40
 ; CHECK: add w[[A:[0-9]+]], w1, w0
 ; CHECK: add {{w[0-9]+}}, w[[A]], w3
   %s1.coerce.fca.0.extract = extractvalue [2 x i64] %s1.coerce, 0
@@ -165,7 +165,7 @@ entry:
 
 define i32 @caller40() #1 {
 entry:
-; CHECK: caller40
+; CHECK-LABEL: caller40
 ; CHECK: ldp x1, x2,
 ; CHECK: ldp x3, x4,
   %0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4
@@ -181,7 +181,7 @@ declare i32 @f40_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
 ; passed on stack at [sp+8] and [sp+24]
 define i32 @caller40_stack() #1 {
 entry:
-; CHECK: caller40_stack
+; CHECK-LABEL: caller40_stack
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #24]
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
 ; CHECK: movz w[[C:[0-9]+]], #0x9
@@ -197,7 +197,7 @@ entry:
 ; passed via i128 in x1 and x3
 define i32 @f41(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 {
 entry:
-; CHECK: f41
+; CHECK-LABEL: f41
 ; CHECK: add w[[A:[0-9]+]], w1, w0
 ; CHECK: add {{w[0-9]+}}, w[[A]], w3
   %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32
@@ -219,7 +219,7 @@ entry:
 
 define i32 @caller41() #1 {
 entry:
-; CHECK: caller41
+; CHECK-LABEL: caller41
 ; CHECK: ldp x1, x2,
 ; CHECK: ldp x3, x4,
   %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16
@@ -235,7 +235,7 @@ declare i32 @f41_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
 ; passed on stack at [sp+16] and [sp+32]
 define i32 @caller41_stack() #1 {
 entry:
-; CHECK: caller41_stack
+; CHECK-LABEL: caller41_stack
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
 ; CHECK: movz w[[C:[0-9]+]], #0x9
@@ -250,7 +250,7 @@ entry:
 ; structs with size of 22 bytes, passed indirectly in x1 and x2
 define i32 @f42(i32 %i, %struct.s42* nocapture %s1, %struct.s42* nocapture %s2) #2 {
 entry:
-; CHECK: f42
+; CHECK-LABEL: f42
 ; CHECK: ldr w[[A:[0-9]+]], [x1]
 ; CHECK: ldr w[[B:[0-9]+]], [x2]
 ; CHECK: add w[[C:[0-9]+]], w[[A]], w0
@@ -280,7 +280,7 @@ entry:
 ; For s1, we allocate a 22-byte space, pass its address via x1
 define i32 @caller42() #3 {
 entry:
-; CHECK: caller42
+; CHECK-LABEL: caller42
 ; CHECK: str {{x[0-9]+}}, [sp, #48]
 ; CHECK: str {{q[0-9]+}}, [sp, #32]
 ; CHECK: str {{x[0-9]+}}, [sp, #16]
@@ -290,7 +290,7 @@ entry:
 ; Space for s1 is allocated at sp+32
 ; Space for s2 is allocated at sp
 
-; FAST: caller42
+; FAST-LABEL: caller42
 ; FAST: sub sp, sp, #96
 ; Space for s1 is allocated at fp-24 = sp+72
 ; Space for s2 is allocated at sp+48
@@ -316,7 +316,7 @@ declare i32 @f42_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
 
 define i32 @caller42_stack() #3 {
 entry:
-; CHECK: caller42_stack
+; CHECK-LABEL: caller42_stack
 ; CHECK: mov x29, sp
 ; CHECK: sub sp, sp, #96
 ; CHECK: stur {{x[0-9]+}}, [x29, #-16]
@@ -333,7 +333,7 @@ entry:
 ; CHECK: movz w[[C:[0-9]+]], #0x9
 ; CHECK: str w[[C]], [sp]
 
-; FAST: caller42_stack
+; FAST-LABEL: caller42_stack
 ; Space for s1 is allocated at fp-24
 ; Space for s2 is allocated at fp-48
 ; FAST: sub x[[A:[0-9]+]], x29, #24
@@ -359,12 +359,12 @@ entry:
 ; passed indirectly in x1 and x2
 define i32 @f43(i32 %i, %struct.s43* nocapture %s1, %struct.s43* nocapture %s2) #2 {
 entry:
-; CHECK: f43
+; CHECK-LABEL: f43
 ; CHECK: ldr w[[A:[0-9]+]], [x1]
 ; CHECK: ldr w[[B:[0-9]+]], [x2]
 ; CHECK: add w[[C:[0-9]+]], w[[A]], w0
 ; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]]
-; FAST: f43
+; FAST-LABEL: f43
 ; FAST: ldr w[[A:[0-9]+]], [x1]
 ; FAST: ldr w[[B:[0-9]+]], [x2]
 ; FAST: add w[[C:[0-9]+]], w[[A]], w0
@@ -388,7 +388,7 @@ entry:
 
 define i32 @caller43() #3 {
 entry:
-; CHECK: caller43
+; CHECK-LABEL: caller43
 ; CHECK: str {{q[0-9]+}}, [sp, #48]
 ; CHECK: str {{q[0-9]+}}, [sp, #32]
 ; CHECK: str {{q[0-9]+}}, [sp, #16]
@@ -398,7 +398,7 @@ entry:
 ; Space for s1 is allocated at sp+32
 ; Space for s2 is allocated at sp
 
-; FAST: caller43
+; FAST-LABEL: caller43
 ; FAST: mov x29, sp
 ; Space for s1 is allocated at sp+32
 ; Space for s2 is allocated at sp
@@ -428,7 +428,7 @@ declare i32 @f43_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
 
 define i32 @caller43_stack() #3 {
 entry:
-; CHECK: caller43_stack
+; CHECK-LABEL: caller43_stack
 ; CHECK: mov x29, sp
 ; CHECK: sub sp, sp, #96
 ; CHECK: stur {{q[0-9]+}}, [x29, #-16]
@@ -445,7 +445,7 @@ entry:
 ; CHECK: movz w[[C:[0-9]+]], #0x9
 ; CHECK: str w[[C]], [sp]
 
-; FAST: caller43_stack
+; FAST-LABEL: caller43_stack
 ; FAST: sub sp, sp, #96
 ; Space for s1 is allocated at fp-32 = sp+64
 ; Space for s2 is allocated at sp+32
@@ -481,13 +481,13 @@ declare i32 @callee_i128_split(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5,
 
 define i32 @i128_split() {
 entry:
-; CHECK: i128_split
+; CHECK-LABEL: i128_split
 ; "i128 %0" should be on stack at [sp].
 ; "i32 8" should be on stack at [sp, #16].
 ; CHECK: str {{w[0-9]+}}, [sp, #16]
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp]
-; FAST: i128_split
-; FAST: sub sp, sp, #48
+; FAST-LABEL: i128_split
+; FAST: sub sp, sp
 ; FAST: mov x[[ADDR:[0-9]+]], sp
 ; FAST: str {{w[0-9]+}}, [x[[ADDR]], #16]
 ; Load/Store opt is disabled with -O0, so the i128 is split.
@@ -504,14 +504,16 @@ declare i32 @callee_i64(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5,
 
 define i32 @i64_split() {
 entry:
-; CHECK: i64_split
+; CHECK-LABEL: i64_split
 ; "i64 %0" should be in register x7.
 ; "i32 8" should be on stack at [sp].
 ; CHECK: ldr x7, [{{x[0-9]+}}]
 ; CHECK: str {{w[0-9]+}}, [sp]
-; FAST: i64_split
+; FAST-LABEL: i64_split
 ; FAST: ldr x7, [{{x[0-9]+}}]
-; FAST: str {{w[0-9]+}}, [sp]
+; FAST: mov x[[R0:[0-9]+]], sp
+; FAST: orr w[[R1:[0-9]+]], wzr, #0x8
+; FAST: str w[[R1]], {{\[}}x[[R0]]{{\]}}
   %0 = load i64* bitcast (%struct.s41* @g41 to i64*), align 16
   %call = tail call i32 @callee_i64(i32 1, i32 2, i32 3, i32 4, i32 5,
                                     i32 6, i32 7, i64 %0, i32 8) #5
diff --git a/test/CodeGen/AArch64/arm64-addr-mode-folding.ll b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
index 08fb8c9..74bb398 100644
--- a/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
+++ b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 -mtriple arm64-apple-ios3 %s -o - | FileCheck %s
+; RUN: llc -O3 -mtriple arm64-apple-ios3 -aarch64-gep-opt=false %s -o - | FileCheck %s
 ; <rdar://problem/13621857>
 
 @block = common global i8* null, align 8
diff --git a/test/CodeGen/AArch64/arm64-addrmode.ll b/test/CodeGen/AArch64/arm64-addrmode.ll
index 700fba8..5433a8c 100644
--- a/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -37,9 +37,8 @@ define void @t3() {
 
 ; base + unsigned offset (> imm12 * size of type in bytes)
 ; CHECK: @t4
-; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #8, lsl #12
-; CHECK: ldr xzr, [
-; CHECK: [[ADDREG]]]
+; CHECK: orr w[[NUM:[0-9]+]], wzr, #0x8000
+; CHECK: ldr xzr, [x{{[0-9]+}}, x[[NUM]]]
 ; CHECK: ret
 define void @t4() {
   %incdec.ptr = getelementptr inbounds i64* @object, i64 4096
@@ -60,9 +59,8 @@ define void @t5(i64 %a) {
 ; base + reg + imm
 ; CHECK: @t6
 ; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, x{{[0-9]+}}, lsl #3
-; CHECK-NEXT: add [[ADDREG]], [[ADDREG]], #8, lsl #12
-; CHECK: ldr xzr, [
-; CHECK: [[ADDREG]]]
+; CHECK-NEXT: orr w[[NUM:[0-9]+]], wzr, #0x8000
+; CHECK: ldr xzr, [x{{[0-9]+}}, x[[NUM]]]
 ; CHECK: ret
 define void @t6(i64 %a) {
   %tmp1 = getelementptr inbounds i64* @object, i64 %a
@@ -70,3 +68,114 @@ define void @t6(i64 %a) {
   %tmp = load volatile i64* %incdec.ptr, align 8
   ret void
 }
+
+; Test base + wide immediate
+define void @t7(i64 %a) {
+; CHECK-LABEL: t7:
+; CHECK: orr w[[NUM:[0-9]+]], wzr, #0xffff
+; CHECK-NEXT: ldr xzr, [x0, x[[NUM]]]
+  %1 = add i64 %a, 65535   ;0xffff
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t8(i64 %a) {
+; CHECK-LABEL: t8:
+; CHECK: movn [[REG:x[0-9]+]], #0x1235
+; CHECK-NEXT: ldr xzr, [x0, [[REG]]]
+  %1 = sub i64 %a, 4662   ;-4662 is 0xffffffffffffedca
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t9(i64 %a) {
+; CHECK-LABEL: t9:
+; CHECK: movn [[REG:x[0-9]+]], #0x1235, lsl #16
+; CHECK-NEXT: ldr xzr, [x0, [[REG]]]
+  %1 = add i64 -305463297, %a   ;-305463297 is 0xffffffffedcaffff
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t10(i64 %a) {
+; CHECK-LABEL: t10:
+; CHECK: movz [[REG:x[0-9]+]], #0x123, lsl #48
+; CHECK-NEXT: ldr xzr, [x0, [[REG]]]
+  %1 = add i64 %a, 81909218222800896   ;0x123000000000000
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t11(i64 %a) {
+; CHECK-LABEL: t11:
+; CHECK: movz w[[NUM:[0-9]+]], #0x123, lsl #16
+; CHECK: movk w[[NUM:[0-9]+]], #0x4567
+; CHECK-NEXT: ldr xzr, [x0, x[[NUM]]]
+  %1 = add i64 %a, 19088743   ;0x1234567
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+; Test some boundaries that should not use movz/movn/orr
+define void @t12(i64 %a) {
+; CHECK-LABEL: t12:
+; CHECK: add [[REG:x[0-9]+]], x0, #4095
+; CHECK-NEXT: ldr xzr, {{\[}}[[REG]]]
+  %1 = add i64 %a, 4095   ;0xfff
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t13(i64 %a) {
+; CHECK-LABEL: t13:
+; CHECK: sub [[REG:x[0-9]+]], x0, #4095
+; CHECK-NEXT: ldr xzr, {{\[}}[[REG]]]
+  %1 = add i64 %a, -4095   ;-0xfff
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t14(i64 %a) {
+; CHECK-LABEL: t14:
+; CHECK: add [[REG:x[0-9]+]], x0, #291, lsl #12
+; CHECK-NEXT: ldr xzr, {{\[}}[[REG]]]
+  %1 = add i64 %a, 1191936   ;0x123000
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t15(i64 %a) {
+; CHECK-LABEL: t15:
+; CHECK: sub [[REG:x[0-9]+]], x0, #291, lsl #12
+; CHECK-NEXT: ldr xzr, {{\[}}[[REG]]]
+  %1 = add i64 %a, -1191936   ;0xFFFFFFFFFFEDD000
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t16(i64 %a) {
+; CHECK-LABEL: t16:
+; CHECK: ldr xzr, [x0, #28672]
+  %1 = add i64 %a, 28672   ;0x7000
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t17(i64 %a) {
+; CHECK-LABEL: t17:
+; CHECK: ldur xzr, [x0, #-256]
+  %1 = add i64 %a, -256   ;-0x100
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-bcc.ll b/test/CodeGen/AArch64/arm64-bcc.ll
new file mode 100644
index 0000000..138ae90
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-bcc.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin  | FileCheck %s
+; Checks for conditional branch b.vs
+
+; Function Attrs: nounwind
+define i32 @add(i32, i32) {
+entry:
+  %2 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %0, i32 %1)
+  %3 = extractvalue { i32, i1 } %2, 1
+  br i1 %3, label %6, label %4
+
+; <label>:4                                       ; preds = %entry
+  %5 = extractvalue { i32, i1 } %2, 0
+  ret i32 %5
+
+; <label>:6                                       ; preds = %entry
+  tail call void @llvm.trap()
+  unreachable
+; CHECK: b.vs
+}
+
+%S64 = type <{ i64 }>
+%S32 = type <{ i32 }>
+%Sstruct = type <{ %S64, %S32 }>
+
+; Checks for compfail when optimizing csincr-cbz sequence
+
+define { i64, i1 } @foo(i64* , %Sstruct* , i1, i64) {
+entry:
+  %.sroa.0 = alloca i72, align 16
+  %.count.value = getelementptr inbounds %Sstruct* %1, i64 0, i32 0, i32 0
+  %4 = load i64* %.count.value, align 8
+  %.repeatedValue.value = getelementptr inbounds %Sstruct* %1, i64 0, i32 1, i32 0
+  %5 = load i32* %.repeatedValue.value, align 8
+  %6 = icmp eq i64 %4, 0
+  br label %7
+
+; <label>:7                                      ; preds = %entry
+  %.mask58 = and i32 %5, -2048
+  %8 = icmp eq i32 %.mask58, 55296
+  %.not134 = xor i1 %8, true
+  %9 = icmp eq i32 %5, 1114112
+  %or.cond135 = and i1 %9, %.not134
+  br i1 %or.cond135, label %10, label %.loopexit
+
+; <label>:10                                      ; preds = %7
+  %11 = and i32 %5, -2048
+  %12 = icmp eq i32 %11, 55296
+  br i1 %12, label %.loopexit, label %10
+
+
+.loopexit:                                        ; preds = %.entry,%7,%10
+  tail call void @llvm.trap()
+  unreachable
+}
+
+; Function Attrs: nounwind readnone
+declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32)
+
+; Function Attrs: noreturn nounwind
+declare void @llvm.trap()
diff --git a/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
index f0e968b..d2985f4 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O1 -o - | FileCheck %s
-; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O0 -fast-isel=true -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -O1 -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -O0 -fast-isel=true -o - | FileCheck %s
 
 ; CHECK-LABEL: test_i64_f64:
 define void @test_i64_f64(double* %p, i64* %q) {
diff --git a/test/CodeGen/AArch64/arm64-big-endian-eh.ll b/test/CodeGen/AArch64/arm64-big-endian-eh.ll
index 93e7da9..a51703a 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-eh.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-eh.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple arm64_be-linux-gnu -filetype obj < %s | llvm-objdump -s - | FileCheck %s
+; RUN: llc -mtriple aarch64_be-linux-gnu -filetype obj < %s | llvm-objdump -s - | FileCheck %s
 
 ; ARM EHABI for big endian
 ; This test case checks whether CIE length record is laid out in big endian format.
diff --git a/test/CodeGen/AArch64/arm64-big-endian-varargs.ll b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll
index d7b26b9..db1f48c 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-varargs.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll
@@ -3,7 +3,7 @@
 ; Vararg saving must save Q registers using the equivalent of STR/STP.
 
 target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "arm64_be-arm-none-eabi"
+target triple = "aarch64_be-arm-none-eabi"
 
 %struct.__va_list = type { i8*, i8*, i8*, i32, i32 }
 
diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll
index 1dcccf1..cc9badc 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
-; RUN: llc -mtriple arm64_be < %s -fast-isel=true -aarch64-load-store-opt=false -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -fast-isel=true -aarch64-load-store-opt=false -o - | FileCheck %s
 
 ; CHECK-LABEL: test_i64_f64:
 define i64 @test_i64_f64(double %p) {
diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
index 9a12b7a..d72d0a5 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
-; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s
 
 ; CHECK-LABEL: test_i64_f64:
 declare i64 @test_i64_f64_helper(double %p)
diff --git a/test/CodeGen/AArch64/arm64-cse.ll b/test/CodeGen/AArch64/arm64-cse.ll
index 5d62cfe..b74ece8 100644
--- a/test/CodeGen/AArch64/arm64-cse.ll
+++ b/test/CodeGen/AArch64/arm64-cse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 < %s -aarch64-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -O3 < %s -aarch64-atomic-cfg-tidy=0 -aarch64-gep-opt=false | FileCheck %s
 target triple = "arm64-apple-ios"
 
 ; rdar://12462006
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
index 6eed48b..2eb6307 100644
--- a/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
@@ -1,8 +1,4 @@
 ; RUN: llc -mcpu=cyclone < %s | FileCheck %s
-
-; r208640 broke ppc64/Linux self-hosting; xfailing while this is worked on.
-; XFAIL: *
-
 target datalayout = "e-i64:64-n32:64-S128"
 target triple = "arm64-apple-ios"
 
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
deleted file mode 100644
index ce132c6..0000000
--- a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
+++ /dev/null
@@ -1,46 +0,0 @@
-; RUN: llc -O3 < %s | FileCheck %s
-; RUN: llc -O3 -addr-sink-using-gep=1 < %s | FileCheck %s
-; Test case for a DAG combiner bug where we combined an indexed load
-; with an extension (sext, zext, or any) into a regular extended load,
-; i.e., dropping the indexed value.
-; <rdar://problem/16389332>
-
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-target triple = "arm64-apple-ios"
-
-%class.A = type { i64, i64 }
-%class.C = type { i64 }
-
-; CHECK-LABEL: XX:
-; CHECK: ldr
-define i32 @XX(%class.A* %K, i1 %tst, i32* %addr, %class.C** %ppC, %class.C* %pC) {
-entry:
-  br i1 %tst, label %if.then, label %lor.rhs.i
-
-lor.rhs.i:                                        ; preds = %entry
-  %tmp = load i32* %addr, align 4
-  %y.i.i.i = getelementptr inbounds %class.A* %K, i64 0, i32 1
-  %tmp1 = load i64* %y.i.i.i, align 8
-  %U.sroa.3.8.extract.trunc.i = trunc i64 %tmp1 to i32
-  %div11.i = sdiv i32 %U.sroa.3.8.extract.trunc.i, 17
-  %add12.i = add nsw i32 0, %div11.i
-  %U.sroa.3.12.extract.shift.i = lshr i64 %tmp1, 32
-  %U.sroa.3.12.extract.trunc.i = trunc i64 %U.sroa.3.12.extract.shift.i to i32
-  %div15.i = sdiv i32 %U.sroa.3.12.extract.trunc.i, 13
-  %add16.i = add nsw i32 %add12.i, %div15.i
-  %rem.i.i = srem i32 %add16.i, %tmp
-  %idxprom = sext i32 %rem.i.i to i64
-  %arrayidx = getelementptr inbounds %class.C** %ppC, i64 %idxprom
-  %tobool533 = icmp eq %class.C* %pC, null
-  br i1 %tobool533, label %while.end, label %while.body
-
-if.then:                                          ; preds = %entry
-  ret i32 42
-
-while.body:                                       ; preds = %lor.rhs.i
-  ret i32 5
-
-while.end:                                        ; preds = %lor.rhs.i
-  %tmp3 = load %class.C** %arrayidx, align 8
-  ret i32 50
-}
diff --git a/test/CodeGen/AArch64/arm64-extern-weak.ll b/test/CodeGen/AArch64/arm64-extern-weak.ll
index a239403..06bd927 100644
--- a/test/CodeGen/AArch64/arm64-extern-weak.ll
+++ b/test/CodeGen/AArch64/arm64-extern-weak.ll
@@ -1,16 +1,23 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -o - < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -o - < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck --check-prefix=CHECK-STATIC %s
 ; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s
 
 declare extern_weak i32 @var()
 
 define i32()* @foo() {
 ; The usual ADRP/ADD pair can't be used for a weak reference because it must
-; evaluate to 0 if the symbol is undefined. We use a litpool entry.
+; evaluate to 0 if the symbol is undefined. We use a GOT entry for PIC
+; otherwise a litpool entry.
   ret i32()* @var
 
 ; CHECK: adrp x[[VAR:[0-9]+]], :got:var
 ; CHECK: ldr x0, [x[[VAR]], :got_lo12:var]
 
+; CHECK-STATIC: .LCPI0_0:
+; CHECK-STATIC-NEXT: .xword  var
+; CHECK-STATIC: adrp x[[VAR:[0-9]+]], .LCPI0_0
+; CHECK-STATIC: ldr x0, [x[[VAR]], :lo12:.LCPI0_0]
+
   ; In the large model, the usual relocations are absolute and can
   ; materialise 0.
 ; CHECK-LARGE: movz x0, #:abs_g3:var
@@ -29,6 +36,11 @@ define i32* @bar() {
 ; CHECK: add x0, [[ARR_VAR]], #20
   ret i32* %addr
 
+; CHECK-STATIC: .LCPI1_0:
+; CHECK-STATIC-NEXT: .xword arr_var
+; CHECK-STATIC: ldr [[BASE:x[0-9]+]], [{{x[0-9]+}}, :lo12:.LCPI1_0]
+; CHECK-STATIC: add x0, [[BASE]], #20
+
   ; In the large model, the usual relocations are absolute and can
   ; materialise 0.
 ; CHECK-LARGE: movz [[ARR_VAR:x[0-9]+]], #:abs_g3:arr_var
@@ -44,6 +56,9 @@ define i32* @wibble() {
 ; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var
 ; CHECK: add x0, [[BASE]], :lo12:defined_weak_var
 
+; CHECK-STATIC: adrp [[BASE:x[0-9]+]], defined_weak_var
+; CHECK-STATIC: add x0, [[BASE]], :lo12:defined_weak_var
+
 ; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var
 ; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var
 ; CHECK-LARGE: movk x0, #:abs_g1_nc:defined_weak_var
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
index ebd847e..d81bc7c 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 @sortlist = common global [5001 x i32] zeroinitializer, align 16
 @sortlist2 = common global [5001 x i64] zeroinitializer, align 16
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
index 1706e9e..a841702 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
@@ -1,5 +1,5 @@
 ; This test should cause the TargetMaterializeAlloca to be invoked
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 %struct.S1Ty = type { i64 }
 %struct.S2Ty = type { %struct.S1Ty, %struct.S1Ty }
@@ -15,9 +15,8 @@ define void @main() nounwind {
 entry:
 ; CHECK: main
 ; CHECK: mov x29, sp
-; CHECK: mov x[[REG:[0-9]+]], sp
-; CHECK-NEXT: orr x[[REG1:[0-9]+]], xzr, #0x8
-; CHECK-NEXT: add x0, x[[REG]], x[[REG1]]
+; CHECK: mov [[REG:x[0-9]+]], sp
+; CHECK-NEXT: add x0, [[REG]], #8
   %E = alloca %struct.S2Ty, align 4
   %B = getelementptr inbounds %struct.S2Ty* %E, i32 0, i32 1
   call void @takeS1(%struct.S1Ty* %B)
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-br.ll b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
index 37a8295..f896d85 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-br.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -mcpu=cyclone | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -mcpu=cyclone -verify-machineinstrs < %s | FileCheck %s
 
 define void @branch1() nounwind uwtable ssp {
   %x = alloca i32, align 4
@@ -95,7 +95,7 @@ entry:
   store i64 %d, i64* %d.addr, align 8
   %0 = load i16* %b.addr, align 2
 ; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
+; CHECK: cmp w0, #0
 ; CHECK: b.eq LBB4_2
   %conv = trunc i16 %0 to i1
   br i1 %conv, label %if.then, label %if.end
@@ -107,7 +107,7 @@ if.then:                                          ; preds = %entry
 if.end:                                           ; preds = %if.then, %entry
   %1 = load i32* %c.addr, align 4
 ; CHECK: and w[[REG:[0-9]+]], w{{[0-9]+}}, #0x1
-; CHECK: subs w{{[0-9]+}}, w[[REG]], #0
+; CHECK: cmp w[[REG]], #0
 ; CHECK: b.eq LBB4_4
   %conv1 = trunc i32 %1 to i1
   br i1 %conv1, label %if.then3, label %if.end4
@@ -118,7 +118,7 @@ if.then3:                                         ; preds = %if.end
 
 if.end4:                                          ; preds = %if.then3, %if.end
   %2 = load i64* %d.addr, align 8
-; CHECK: subs w{{[0-9]+}}, w{{[0-9]+}}, #0
+; CHECK: cmp w{{[0-9]+}}, #0
 ; CHECK: b.eq LBB4_6
   %conv5 = trunc i64 %2 to i1
   br i1 %conv5, label %if.then7, label %if.end8
@@ -137,11 +137,10 @@ declare void @foo1()
 ; rdar://15174028
 define i32 @trunc64(i64 %foo) nounwind {
 ; CHECK: trunc64
-; CHECK: orr  [[REG:x[0-9]+]], xzr, #0x1
-; CHECK: and  [[REG2:x[0-9]+]], x0, [[REG]]
-; CHECK: mov  x[[REG3:[0-9]+]], [[REG2]]
-; CHECK: and  [[REG4:w[0-9]+]], w[[REG3]], #0x1
-; CHECK: subs {{w[0-9]+}}, [[REG4]], #0
+; CHECK: and  [[REG1:x[0-9]+]], x0, #0x1
+; CHECK: mov  x[[REG2:[0-9]+]], [[REG1]]
+; CHECK: and  [[REG3:w[0-9]+]], w[[REG2]], #0x1
+; CHECK: cmp  [[REG3]], #0
 ; CHECK: b.eq LBB5_2
   %a = and i64 %foo, 1
   %b = trunc i64 %a to i1
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-call.ll b/test/CodeGen/AArch64/arm64-fast-isel-call.ll
index 8d756ae..f1e2c40 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-call.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-call.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64_be-linux-gnu | FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -O0 -fast-isel-abort -fast-isel-abort-args -code-model=small -verify-machineinstrs -mtriple=arm64-apple-darwin   < %s | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -fast-isel-abort-args -code-model=large -verify-machineinstrs -mtriple=arm64-apple-darwin   < %s | FileCheck %s --check-prefix=LARGE
+; RUN: llc -O0 -fast-isel-abort -fast-isel-abort-args -code-model=small -verify-machineinstrs -mtriple=aarch64_be-linux-gnu < %s | FileCheck %s --check-prefix=CHECK-BE
 
 define void @call0() nounwind {
 entry:
@@ -8,8 +9,12 @@ entry:
 
 define void @foo0() nounwind {
 entry:
-; CHECK: foo0
-; CHECK: bl _call0
+; CHECK-LABEL: foo0
+; CHECK:       bl _call0
+; LARGE-LABEL: foo0
+; LARGE:       adrp [[REG0:x[0-9]+]], _call0@GOTPAGE
+; LARGE:       ldr  [[REG1:x[0-9]+]], {{\[}}[[REG0]], _call0@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr  [[REG1]]
   call void @call0()
   ret void
 }
@@ -24,10 +29,10 @@ entry:
 
 define i32 @foo1(i32 %a) nounwind {
 entry:
-; CHECK: foo1
-; CHECK: stur w0, [x29, #-4]
-; CHECK-NEXT: ldur w0, [x29, #-4]
-; CHECK-NEXT: bl _call1
+; CHECK-LABEL: foo1
+; CHECK:       stur w0, [x29, #-4]
+; CHECK-NEXT:  ldur w0, [x29, #-4]
+; CHECK-NEXT:  bl _call1
   %a.addr = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
   %tmp = load i32* %a.addr, align 4
@@ -37,10 +42,10 @@ entry:
 
 define i32 @sext_(i8 %a, i16 %b) nounwind {
 entry:
-; CHECK: @sext_
-; CHECK: sxtb w0, w0
-; CHECK: sxth w1, w1
-; CHECK: bl _foo_sext_
+; CHECK-LABEL: sext_
+; CHECK:       sxtb w0, w0
+; CHECK:       sxth w1, w1
+; CHECK:       bl _foo_sext_
   call void @foo_sext_(i8 signext %a, i16 signext %b)
   ret i32 0
 }
@@ -49,9 +54,9 @@ declare void @foo_sext_(i8 %a, i16 %b)
 
 define i32 @zext_(i8 %a, i16 %b) nounwind {
 entry:
-; CHECK: @zext_
-; CHECK: uxtb w0, w0
-; CHECK: uxth w1, w1
+; CHECK-LABEL: zext_
+; CHECK:       uxtb w0, w0
+; CHECK:       uxth w1, w1
   call void @foo_zext_(i8 zeroext %a, i16 zeroext %b)
   ret i32 0
 }
@@ -60,10 +65,10 @@ declare void @foo_zext_(i8 %a, i16 %b)
 
 define i32 @t1(i32 %argc, i8** nocapture %argv) {
 entry:
-; CHECK: @t1
+; CHECK-LABEL: @t1
 ; The last parameter will be passed on stack via i8.
-; CHECK: strb w{{[0-9]+}}, [sp]
-; CHECK-NEXT: bl _bar
+; CHECK:       strb w{{[0-9]+}}, [sp]
+; CHECK:       bl _bar
   %call = call i32 @bar(i8 zeroext 0, i8 zeroext -8, i8 zeroext -69, i8 zeroext 28, i8 zeroext 40, i8 zeroext -70, i8 zeroext 28, i8 zeroext 39, i8 zeroext -41)
   ret i32 0
 }
@@ -73,18 +78,19 @@ declare i32 @bar(i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8
 ; Test materialization of integers.  Target-independent selector handles this.
 define i32 @t2() {
 entry:
-; CHECK: @t2
-; CHECK: movz x0, #0
-; CHECK: orr w1, wzr, #0xfffffff8
-; CHECK: orr w[[REG:[0-9]+]], wzr, #0x3ff
-; CHECK: orr w[[REG2:[0-9]+]], wzr, #0x2
-; CHECK: movz w[[REG3:[0-9]+]], #0
-; CHECK: orr w[[REG4:[0-9]+]], wzr, #0x1
-; CHECK: uxth w2, w[[REG]]
-; CHECK: sxtb w3, w[[REG2]]
-; CHECK: and w4, w[[REG3]], #0x1
-; CHECK: and w5, w[[REG4]], #0x1
-; CHECK: bl	_func2
+; CHECK-LABEL: t2
+; CHECK:       mov [[REG1:x[0-9]+]], xzr
+; CHECK:       orr w1, wzr, #0xfffffff8
+; CHECK:       orr [[REG2:w[0-9]+]], wzr, #0x3ff
+; CHECK:       orr [[REG3:w[0-9]+]], wzr, #0x2
+; CHECK:       mov [[REG4:w[0-9]+]], wzr
+; CHECK:       orr [[REG5:w[0-9]+]], wzr, #0x1
+; CHECK:       mov x0, [[REG1]]
+; CHECK:       uxth w2, [[REG2]]
+; CHECK:       sxtb w3, [[REG3]]
+; CHECK:       and w4, [[REG4]], #0x1
+; CHECK:       and w5, [[REG5]], #0x1
+; CHECK:       bl _func2
   %call = call i32 @func2(i64 zeroext 0, i32 signext -8, i16 zeroext 1023, i8 signext -254, i1 zeroext 0, i1 zeroext 1)
   ret i32 0
 }
@@ -94,7 +100,170 @@ declare i32 @func2(i64 zeroext, i32 signext, i16 zeroext, i8 signext, i1 zeroext
 declare void @callee_b0f(i8 %bp10, i8 %bp11, i8 %bp12, i8 %bp13, i8 %bp14, i8 %bp15, i8 %bp17, i8 %bp18, i8 %bp19)
 define void @caller_b1f() {
 entry:
-  ; CHECK-BE: strb w{{.*}}, [sp, #7]
+; CHECK-BE-LABEL: caller_b1f
+; CHECK-BE:       strb w{{.*}}, [sp, #7]
   call void @callee_b0f(i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 42)
   ret void
 }
+
+define zeroext i1 @call_arguments1(i1 %a1, i1 %a2, i1 %a3, i1 %a4, i1 %a5, i1 %a6, i1 %a7, i1 %a8) {
+; CHECK-LABEL: call_arguments1
+; CHECK:       and {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:  and {{w[0-9]+}}, w2, w3
+; CHECK-NEXT:  and {{w[0-9]+}}, w4, w5
+; CHECK-NEXT:  and {{w[0-9]+}}, w6, w7
+  %1 = and i1 %a1, %a2
+  %2 = and i1 %a3, %a4
+  %3 = and i1 %a5, %a6
+  %4 = and i1 %a7, %a8
+  %5 = and i1 %1, %2
+  %6 = and i1 %3, %4
+  %7 = and i1 %5, %6
+  ret i1 %7
+}
+
+define i32 @call_arguments2(i8 zeroext %a1, i8 zeroext %a2, i8 zeroext %a3, i8 zeroext %a4, i8 signext %a5, i8 signext %a6, i8 signext %a7, i8 signext %a8) {
+; CHECK-LABEL: call_arguments2
+; CHECK:       add {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:  add {{w[0-9]+}}, w2, w3
+; CHECK-NEXT:  add {{w[0-9]+}}, w4, w5
+; CHECK-NEXT:  add {{w[0-9]+}}, w6, w7
+  %a1z = zext i8 %a1 to i32
+  %a2z = zext i8 %a2 to i32
+  %a3z = zext i8 %a3 to i32
+  %a4z = zext i8 %a4 to i32
+  %a5s = sext i8 %a5 to i32
+  %a6s = sext i8 %a6 to i32
+  %a7s = sext i8 %a7 to i32
+  %a8s = sext i8 %a8 to i32
+  %1 = add i32 %a1z, %a2z
+  %2 = add i32 %a3z, %a4z
+  %3 = add i32 %a5s, %a6s
+  %4 = add i32 %a7s, %a8s
+  %5 = add i32 %1, %2
+  %6 = add i32 %3, %4
+  %7 = add i32 %5, %6
+  ret i32 %7
+}
+
+define i32 @call_arguments3(i16 zeroext %a1, i16 zeroext %a2, i16 zeroext %a3, i16 zeroext %a4, i16 signext %a5, i16 signext %a6, i16 signext %a7, i16 signext %a8) {
+; CHECK-LABEL: call_arguments3
+; CHECK:       add {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:  add {{w[0-9]+}}, w2, w3
+; CHECK-NEXT:  add {{w[0-9]+}}, w4, w5
+; CHECK-NEXT:  add {{w[0-9]+}}, w6, w7
+  %a1z = zext i16 %a1 to i32
+  %a2z = zext i16 %a2 to i32
+  %a3z = zext i16 %a3 to i32
+  %a4z = zext i16 %a4 to i32
+  %a5s = sext i16 %a5 to i32
+  %a6s = sext i16 %a6 to i32
+  %a7s = sext i16 %a7 to i32
+  %a8s = sext i16 %a8 to i32
+  %1 = add i32 %a1z, %a2z
+  %2 = add i32 %a3z, %a4z
+  %3 = add i32 %a5s, %a6s
+  %4 = add i32 %a7s, %a8s
+  %5 = add i32 %1, %2
+  %6 = add i32 %3, %4
+  %7 = add i32 %5, %6
+  ret i32 %7
+}
+
+define i32 @call_arguments4(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
+; CHECK-LABEL: call_arguments4
+; CHECK:       add {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:  add {{w[0-9]+}}, w2, w3
+; CHECK-NEXT:  add {{w[0-9]+}}, w4, w5
+; CHECK-NEXT:  add {{w[0-9]+}}, w6, w7
+  %1 = add i32 %a1, %a2
+  %2 = add i32 %a3, %a4
+  %3 = add i32 %a5, %a6
+  %4 = add i32 %a7, %a8
+  %5 = add i32 %1, %2
+  %6 = add i32 %3, %4
+  %7 = add i32 %5, %6
+  ret i32 %7
+}
+
+define i64 @call_arguments5(i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, i64 %a8) {
+; CHECK-LABEL: call_arguments5
+; CHECK:       add {{x[0-9]+}}, x0, x1
+; CHECK-NEXT:  add {{x[0-9]+}}, x2, x3
+; CHECK-NEXT:  add {{x[0-9]+}}, x4, x5
+; CHECK-NEXT:  add {{x[0-9]+}}, x6, x7
+  %1 = add i64 %a1, %a2
+  %2 = add i64 %a3, %a4
+  %3 = add i64 %a5, %a6
+  %4 = add i64 %a7, %a8
+  %5 = add i64 %1, %2
+  %6 = add i64 %3, %4
+  %7 = add i64 %5, %6
+  ret i64 %7
+}
+
+define float @call_arguments6(float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7, float %a8) {
+; CHECK-LABEL: call_arguments6
+; CHECK:       fadd {{s[0-9]+}}, s0, s1
+; CHECK-NEXT:  fadd {{s[0-9]+}}, s2, s3
+; CHECK-NEXT:  fadd {{s[0-9]+}}, s4, s5
+; CHECK-NEXT:  fadd {{s[0-9]+}}, s6, s7
+  %1 = fadd float %a1, %a2
+  %2 = fadd float %a3, %a4
+  %3 = fadd float %a5, %a6
+  %4 = fadd float %a7, %a8
+  %5 = fadd float %1, %2
+  %6 = fadd float %3, %4
+  %7 = fadd float %5, %6
+  ret float %7
+}
+
+define double @call_arguments7(double %a1, double %a2, double %a3, double %a4, double %a5, double %a6, double %a7, double %a8) {
+; CHECK-LABEL: call_arguments7
+; CHECK:       fadd {{d[0-9]+}}, d0, d1
+; CHECK-NEXT:  fadd {{d[0-9]+}}, d2, d3
+; CHECK-NEXT:  fadd {{d[0-9]+}}, d4, d5
+; CHECK-NEXT:  fadd {{d[0-9]+}}, d6, d7
+  %1 = fadd double %a1, %a2
+  %2 = fadd double %a3, %a4
+  %3 = fadd double %a5, %a6
+  %4 = fadd double %a7, %a8
+  %5 = fadd double %1, %2
+  %6 = fadd double %3, %4
+  %7 = fadd double %5, %6
+  ret double %7
+}
+
+define i64 @call_arguments8(i32 %a1, i64 %a2, i32 %a3, i64 %a4) {
+; CHECK-LABEL: call_arguments8
+; CHECK:       ubfx  [[REG1:x[0-9]+]], {{x[0-9]+}}, #0, #32
+; CHECK:       ubfx  [[REG2:x[0-9]+]], {{x[0-9]+}}, #0, #32
+; CHECK:       add {{x[0-9]+}}, [[REG1]], x1
+; CHECK-NEXT:  add {{x[0-9]+}}, [[REG2]], x3
+  %aa1 = zext i32 %a1 to i64
+  %aa3 = zext i32 %a3 to i64
+  %1 = add i64 %aa1, %a2
+  %2 = add i64 %aa3, %a4
+  %3 = add i64 %1, %2
+  ret i64 %3
+}
+
+define void @call_arguments9(i8 %a1, i16 %a2, i32 %a3, i64 %a4, float %a5, double %a6, i64 %a7, double %a8) {
+; CHECK-LABEL: call_arguments9
+  ret void
+}
+
+; Test that we use the correct register class for the branch.
+define void @call_blr(i64 %Fn, i1 %c) {
+; CHECK-LABEL: call_blr
+; CHECK:       blr
+  br i1 %c, label %bb1, label %bb2
+bb1:
+  %1 = inttoptr i64 %Fn to void (i64)*
+  br label %bb2
+bb2:
+  %2 = phi void (i64)* [ %1, %bb1 ], [ undef, %0 ]
+  call void %2(i64 1)
+  ret void
+}
+
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll b/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll
index c5417de..e515184 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -mcpu=cyclone | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin -mcpu=cyclone < %s | FileCheck %s
 
 ;; Test various conversions.
 define zeroext i32 @trunc_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) nounwind ssp {
 entry:
-; CHECK: trunc_
+; CHECK-LABEL: trunc_
 ; CHECK: sub sp, sp, #16
 ; CHECK: strb w0, [sp, #15]
 ; CHECK: strh w1, [sp, #12]
@@ -17,7 +17,6 @@ entry:
 ; CHECK: ldrh w0, [sp, #12]
 ; CHECK: strb w0, [sp, #15]
 ; CHECK: ldrb w0, [sp, #15]
-; CHECK: uxtb w0, w0
 ; CHECK: add sp, sp, #16
 ; CHECK: ret
   %a.addr = alloca i8, align 1
@@ -44,21 +43,18 @@ entry:
 
 define i64 @zext_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) nounwind ssp {
 entry:
-; CHECK: zext_
+; CHECK-LABEL: zext_
 ; CHECK: sub sp, sp, #16
 ; CHECK: strb w0, [sp, #15]
 ; CHECK: strh w1, [sp, #12]
 ; CHECK: str w2, [sp, #8]
 ; CHECK: str x3, [sp]
 ; CHECK: ldrb w0, [sp, #15]
-; CHECK: uxtb w0, w0
 ; CHECK: strh w0, [sp, #12]
 ; CHECK: ldrh w0, [sp, #12]
-; CHECK: uxth w0, w0
 ; CHECK: str w0, [sp, #8]
 ; CHECK: ldr w0, [sp, #8]
 ; CHECK: mov x3, x0
-; CHECK: ubfx x3, x3, #0, #32
 ; CHECK: str x3, [sp]
 ; CHECK: ldr x0, [sp]
 ; CHECK: ret
@@ -85,37 +81,35 @@ entry:
 
 define i32 @zext_i1_i32(i1 zeroext %a) nounwind ssp {
 entry:
-; CHECK: @zext_i1_i32
-; CHECK: and w0, w0, #0x1
+; CHECK-LABEL: zext_i1_i32
+; CHECK-NOT:   and w0, w0, #0x1
+; CHECK:       ret
   %conv = zext i1 %a to i32
   ret i32 %conv;
 }
 
 define i64 @zext_i1_i64(i1 zeroext %a) nounwind ssp {
 entry:
-; CHECK: @zext_i1_i64
-; CHECK: and w0, w0, #0x1
+; CHECK-LABEL: zext_i1_i64
+; CHECK-NOT:   and w0, w0, #0x1
+; CHECK:       ret
   %conv = zext i1 %a to i64
   ret i64 %conv;
 }
 
 define i64 @sext_(i8 signext %a, i16 signext %b, i32 %c, i64 %d) nounwind ssp {
 entry:
-; CHECK: sext_
+; CHECK-LABEL: sext_
 ; CHECK: sub sp, sp, #16
 ; CHECK: strb w0, [sp, #15]
 ; CHECK: strh w1, [sp, #12]
 ; CHECK: str w2, [sp, #8]
 ; CHECK: str x3, [sp]
-; CHECK: ldrb w0, [sp, #15]
-; CHECK: sxtb w0, w0
+; CHECK: ldrsb w0, [sp, #15]
 ; CHECK: strh w0, [sp, #12]
-; CHECK: ldrh w0, [sp, #12]
-; CHECK: sxth w0, w0
+; CHECK: ldrsh w0, [sp, #12]
 ; CHECK: str w0, [sp, #8]
-; CHECK: ldr w0, [sp, #8]
-; CHECK: mov x3, x0
-; CHECK: sxtw x3, w3
+; CHECK: ldrsw x3, [sp, #8]
 ; CHECK: str x3, [sp]
 ; CHECK: ldr x0, [sp]
 ; CHECK: ret
@@ -161,8 +155,9 @@ define zeroext i64 @sext_i16_i64(i16 zeroext %in) {
 ; Test sext i1 to i32
 define i32 @sext_i1_i32(i1 signext %a) nounwind ssp {
 entry:
-; CHECK: sext_i1_i32
-; CHECK: sbfx w0, w0, #0, #1
+; CHECK-LABEL: sext_i1_i32
+; CHECK-NOT:   sbfx w0, w0, #0, #1
+; CHECK:       ret
   %conv = sext i1 %a to i32
   ret i32 %conv
 }
@@ -170,7 +165,7 @@ entry:
 ; Test sext i1 to i16
 define signext i16 @sext_i1_i16(i1 %a) nounwind ssp {
 entry:
-; CHECK: sext_i1_i16
+; CHECK-LABEL: sext_i1_i16
 ; CHECK: sbfx w0, w0, #0, #1
   %conv = sext i1 %a to i16
   ret i16 %conv
@@ -179,7 +174,7 @@ entry:
 ; Test sext i1 to i8
 define signext i8 @sext_i1_i8(i1 %a) nounwind ssp {
 entry:
-; CHECK: sext_i1_i8
+; CHECK-LABEL: sext_i1_i8
 ; CHECK: sbfx w0, w0, #0, #1
   %conv = sext i1 %a to i8
   ret i8 %conv
@@ -188,7 +183,7 @@ entry:
 ; Test fpext
 define double @fpext_(float %a) nounwind ssp {
 entry:
-; CHECK: fpext_
+; CHECK-LABEL: fpext_
 ; CHECK: fcvt d0, s0
   %conv = fpext float %a to double
   ret double %conv
@@ -197,7 +192,7 @@ entry:
 ; Test fptrunc
 define float @fptrunc_(double %a) nounwind ssp {
 entry:
-; CHECK: fptrunc_
+; CHECK-LABEL: fptrunc_
 ; CHECK: fcvt s0, d0
   %conv = fptrunc double %a to float
   ret float %conv
@@ -206,7 +201,7 @@ entry:
 ; Test fptosi
 define i32 @fptosi_ws(float %a) nounwind ssp {
 entry:
-; CHECK: fptosi_ws
+; CHECK-LABEL: fptosi_ws
 ; CHECK: fcvtzs w0, s0
   %conv = fptosi float %a to i32
   ret i32 %conv
@@ -215,7 +210,7 @@ entry:
 ; Test fptosi
 define i32 @fptosi_wd(double %a) nounwind ssp {
 entry:
-; CHECK: fptosi_wd
+; CHECK-LABEL: fptosi_wd
 ; CHECK: fcvtzs w0, d0
   %conv = fptosi double %a to i32
   ret i32 %conv
@@ -224,7 +219,7 @@ entry:
 ; Test fptoui
 define i32 @fptoui_ws(float %a) nounwind ssp {
 entry:
-; CHECK: fptoui_ws
+; CHECK-LABEL: fptoui_ws
 ; CHECK: fcvtzu w0, s0
   %conv = fptoui float %a to i32
   ret i32 %conv
@@ -233,7 +228,7 @@ entry:
 ; Test fptoui
 define i32 @fptoui_wd(double %a) nounwind ssp {
 entry:
-; CHECK: fptoui_wd
+; CHECK-LABEL: fptoui_wd
 ; CHECK: fcvtzu w0, d0
   %conv = fptoui double %a to i32
   ret i32 %conv
@@ -242,7 +237,7 @@ entry:
 ; Test sitofp
 define float @sitofp_sw_i1(i1 %a) nounwind ssp {
 entry:
-; CHECK: sitofp_sw_i1
+; CHECK-LABEL: sitofp_sw_i1
 ; CHECK: sbfx w0, w0, #0, #1
 ; CHECK: scvtf s0, w0
   %conv = sitofp i1 %a to float
@@ -252,7 +247,7 @@ entry:
 ; Test sitofp
 define float @sitofp_sw_i8(i8 %a) nounwind ssp {
 entry:
-; CHECK: sitofp_sw_i8
+; CHECK-LABEL: sitofp_sw_i8
 ; CHECK: sxtb w0, w0
 ; CHECK: scvtf s0, w0
   %conv = sitofp i8 %a to float
@@ -262,9 +257,7 @@ entry:
 ; Test sitofp
 define float @sitofp_sw_i16(i16 %a) nounwind ssp {
 entry:
-; CHECK: sitofp_sw_i16
-; CHECK: sxth w0, w0
-; CHECK: scvtf s0, w0
+; CHECK-LABEL: sitofp_sw_i16
   %conv = sitofp i16 %a to float
   ret float %conv
 }
@@ -272,7 +265,7 @@ entry:
 ; Test sitofp
 define float @sitofp_sw(i32 %a) nounwind ssp {
 entry:
-; CHECK: sitofp_sw
+; CHECK-LABEL: sitofp_sw
 ; CHECK: scvtf s0, w0
   %conv = sitofp i32 %a to float
   ret float %conv
@@ -281,7 +274,7 @@ entry:
 ; Test sitofp
 define float @sitofp_sx(i64 %a) nounwind ssp {
 entry:
-; CHECK: sitofp_sx
+; CHECK-LABEL: sitofp_sx
 ; CHECK: scvtf s0, x0
   %conv = sitofp i64 %a to float
   ret float %conv
@@ -290,7 +283,7 @@ entry:
 ; Test sitofp
 define double @sitofp_dw(i32 %a) nounwind ssp {
 entry:
-; CHECK: sitofp_dw
+; CHECK-LABEL: sitofp_dw
 ; CHECK: scvtf d0, w0
   %conv = sitofp i32 %a to double
   ret double %conv
@@ -299,7 +292,7 @@ entry:
 ; Test sitofp
 define double @sitofp_dx(i64 %a) nounwind ssp {
 entry:
-; CHECK: sitofp_dx
+; CHECK-LABEL: sitofp_dx
 ; CHECK: scvtf d0, x0
   %conv = sitofp i64 %a to double
   ret double %conv
@@ -308,7 +301,7 @@ entry:
 ; Test uitofp
 define float @uitofp_sw_i1(i1 %a) nounwind ssp {
 entry:
-; CHECK: uitofp_sw_i1
+; CHECK-LABEL: uitofp_sw_i1
 ; CHECK: and w0, w0, #0x1
 ; CHECK: ucvtf s0, w0
   %conv = uitofp i1 %a to float
@@ -318,9 +311,7 @@ entry:
 ; Test uitofp
 define float @uitofp_sw_i8(i8 %a) nounwind ssp {
 entry:
-; CHECK: uitofp_sw_i8
-; CHECK: uxtb w0, w0
-; CHECK: ucvtf s0, w0
+; CHECK-LABEL: uitofp_sw_i8
   %conv = uitofp i8 %a to float
   ret float %conv
 }
@@ -328,9 +319,7 @@ entry:
 ; Test uitofp
 define float @uitofp_sw_i16(i16 %a) nounwind ssp {
 entry:
-; CHECK: uitofp_sw_i16
-; CHECK: uxth w0, w0
-; CHECK: ucvtf s0, w0
+; CHECK-LABEL: uitofp_sw_i16
   %conv = uitofp i16 %a to float
   ret float %conv
 }
@@ -338,7 +327,7 @@ entry:
 ; Test uitofp
 define float @uitofp_sw(i32 %a) nounwind ssp {
 entry:
-; CHECK: uitofp_sw
+; CHECK-LABEL: uitofp_sw
 ; CHECK: ucvtf s0, w0
   %conv = uitofp i32 %a to float
   ret float %conv
@@ -347,7 +336,7 @@ entry:
 ; Test uitofp
 define float @uitofp_sx(i64 %a) nounwind ssp {
 entry:
-; CHECK: uitofp_sx
+; CHECK-LABEL: uitofp_sx
 ; CHECK: ucvtf s0, x0
   %conv = uitofp i64 %a to float
   ret float %conv
@@ -356,7 +345,7 @@ entry:
 ; Test uitofp
 define double @uitofp_dw(i32 %a) nounwind ssp {
 entry:
-; CHECK: uitofp_dw
+; CHECK-LABEL: uitofp_dw
 ; CHECK: ucvtf d0, w0
   %conv = uitofp i32 %a to double
   ret double %conv
@@ -365,7 +354,7 @@ entry:
 ; Test uitofp
 define double @uitofp_dx(i64 %a) nounwind ssp {
 entry:
-; CHECK: uitofp_dx
+; CHECK-LABEL: uitofp_dx
 ; CHECK: ucvtf d0, x0
   %conv = uitofp i64 %a to double
   ret double %conv
@@ -373,7 +362,7 @@ entry:
 
 define i32 @i64_trunc_i32(i64 %a) nounwind ssp {
 entry:
-; CHECK: i64_trunc_i32
+; CHECK-LABEL: i64_trunc_i32
 ; CHECK: mov x1, x0
   %conv = trunc i64 %a to i32
   ret i32 %conv
@@ -381,7 +370,7 @@ entry:
 
 define zeroext i16 @i64_trunc_i16(i64 %a) nounwind ssp {
 entry:
-; CHECK: i64_trunc_i16
+; CHECK-LABEL: i64_trunc_i16
 ; CHECK: mov x[[REG:[0-9]+]], x0
 ; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0xffff
 ; CHECK: uxth w0, [[REG2]]
@@ -391,7 +380,7 @@ entry:
 
 define zeroext i8 @i64_trunc_i8(i64 %a) nounwind ssp {
 entry:
-; CHECK: i64_trunc_i8
+; CHECK-LABEL: i64_trunc_i8
 ; CHECK: mov x[[REG:[0-9]+]], x0
 ; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0xff
 ; CHECK: uxtb w0, [[REG2]]
@@ -401,7 +390,7 @@ entry:
 
 define zeroext i1 @i64_trunc_i1(i64 %a) nounwind ssp {
 entry:
-; CHECK: i64_trunc_i1
+; CHECK-LABEL: i64_trunc_i1
 ; CHECK: mov x[[REG:[0-9]+]], x0
 ; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0x1
 ; CHECK: and w0, [[REG2]], #0x1
@@ -411,7 +400,7 @@ entry:
 
 ; rdar://15101939
 define void @stack_trunc() nounwind {
-; CHECK: stack_trunc
+; CHECK-LABEL: stack_trunc
 ; CHECK: sub  sp, sp, #16
 ; CHECK: ldr  [[REG:x[0-9]+]], [sp]
 ; CHECK: mov  x[[REG2:[0-9]+]], [[REG]]
@@ -428,15 +417,36 @@ define void @stack_trunc() nounwind {
 
 define zeroext i64 @zext_i8_i64(i8 zeroext %in) {
 ; CHECK-LABEL: zext_i8_i64:
-; CHECK: mov x[[TMP:[0-9]+]], x0
-; CHECK: ubfx x0, x[[TMP]], #0, #8
+; CHECK-NOT:   ubfx x0, {{x[0-9]+}}, #0, #8
+; CHECK:       ret
   %big = zext i8 %in to i64
   ret i64 %big
 }
 define zeroext i64 @zext_i16_i64(i16 zeroext %in) {
 ; CHECK-LABEL: zext_i16_i64:
-; CHECK: mov x[[TMP:[0-9]+]], x0
-; CHECK: ubfx x0, x[[TMP]], #0, #16
+; CHECK-NOT:   ubfx x0, {{x[0-9]+}}, #0, #16
+; CHECK:       ret
   %big = zext i16 %in to i64
   ret i64 %big
 }
+
+define float @bitcast_i32_to_float(i32 %a) {
+  %1 = bitcast i32 %a to float
+  ret float %1
+}
+
+define double @bitcast_i64_to_double(i64 %a) {
+  %1 = bitcast i64 %a to double
+  ret double %1
+}
+
+define i32 @bitcast_float_to_i32(float %a) {
+  %1 = bitcast float %a to i32
+  ret i32 %1
+}
+
+define i64 @bitcast_double_to_i64(double %a) {
+  %1 = bitcast double %a to i64
+  ret i64 %1
+}
+
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll
index f030596..111b6bd 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll
@@ -1,146 +1,162 @@
-; RUN: llc < %s -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
-define zeroext i1 @fcmp_float1(float %a) nounwind ssp {
-entry:
-; CHECK-LABEL: @fcmp_float1
-; CHECK: fcmp s0, #0.0
-; CHECK: cset w{{[0-9]+}}, ne
-  %cmp = fcmp une float %a, 0.000000e+00
-  ret i1 %cmp
+define zeroext i1 @fcmp_float1(float %a) {
+; CHECK-LABEL: fcmp_float1
+; CHECK:       fcmp s0, #0.0
+; CHECK-NEXT:  cset {{w[0-9]+}}, ne
+  %1 = fcmp une float %a, 0.000000e+00
+  ret i1 %1
 }
 
-define zeroext i1 @fcmp_float2(float %a, float %b) nounwind ssp {
-entry:
-; CHECK-LABEL: @fcmp_float2
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, ne
-  %cmp = fcmp une float %a, %b
-  ret i1 %cmp
+define zeroext i1 @fcmp_float2(float %a, float %b) {
+; CHECK-LABEL: fcmp_float2
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, ne
+  %1 = fcmp une float %a, %b
+  ret i1 %1
 }
 
-define zeroext i1 @fcmp_double1(double %a) nounwind ssp {
-entry:
-; CHECK-LABEL: @fcmp_double1
-; CHECK: fcmp d0, #0.0
-; CHECK: cset w{{[0-9]+}}, ne
-  %cmp = fcmp une double %a, 0.000000e+00
-  ret i1 %cmp
+define zeroext i1 @fcmp_double1(double %a) {
+; CHECK-LABEL: fcmp_double1
+; CHECK:       fcmp d0, #0.0
+; CHECK-NEXT:  cset {{w[0-9]+}}, ne
+  %1 = fcmp une double %a, 0.000000e+00
+  ret i1 %1
 }
 
-define zeroext i1 @fcmp_double2(double %a, double %b) nounwind ssp {
-entry:
-; CHECK-LABEL: @fcmp_double2
-; CHECK: fcmp d0, d1
-; CHECK: cset w{{[0-9]+}}, ne
-  %cmp = fcmp une double %a, %b
-  ret i1 %cmp
+define zeroext i1 @fcmp_double2(double %a, double %b) {
+; CHECK-LABEL: fcmp_double2
+; CHECK:       fcmp d0, d1
+; CHECK-NEXT:  cset {{w[0-9]+}}, ne
+  %1 = fcmp une double %a, %b
+  ret i1 %1
 }
 
 ; Check each fcmp condition
-define float @fcmp_oeq(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_oeq
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, eq
-  %cmp = fcmp oeq float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
-}
-
-define float @fcmp_ogt(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ogt
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, gt
-  %cmp = fcmp ogt float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
-}
-
-define float @fcmp_oge(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_oge
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, ge
-  %cmp = fcmp oge float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
-}
-
-define float @fcmp_olt(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_olt
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, mi
-  %cmp = fcmp olt float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
-}
-
-define float @fcmp_ole(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ole
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, ls
-  %cmp = fcmp ole float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
-}
-
-define float @fcmp_ord(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ord
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, vc
-  %cmp = fcmp ord float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
-}
-
-define float @fcmp_uno(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_uno
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, vs
-  %cmp = fcmp uno float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
-}
-
-define float @fcmp_ugt(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ugt
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, hi
-  %cmp = fcmp ugt float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
-}
-
-define float @fcmp_uge(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_uge
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, pl
-  %cmp = fcmp uge float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
-}
-
-define float @fcmp_ult(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ult
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, lt
-  %cmp = fcmp ult float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
-}
-
-define float @fcmp_ule(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ule
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, le
-  %cmp = fcmp ule float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
-}
-
-define float @fcmp_une(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_une
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, ne
-  %cmp = fcmp une float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
+define zeroext i1 @fcmp_false(float %a) {
+; CHECK-LABEL: fcmp_false
+; CHECK:       mov {{w[0-9]+}}, wzr
+  %1 = fcmp ogt float %a, %a
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_oeq(float %a, float %b) {
+; CHECK-LABEL: fcmp_oeq
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, eq
+  %1 = fcmp oeq float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_ogt(float %a, float %b) {
+; CHECK-LABEL: fcmp_ogt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, gt
+  %1 = fcmp ogt float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_oge(float %a, float %b) {
+; CHECK-LABEL: fcmp_oge
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, ge
+  %1 = fcmp oge float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_olt(float %a, float %b) {
+; CHECK-LABEL: fcmp_olt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, mi
+  %1 = fcmp olt float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_ole(float %a, float %b) {
+; CHECK-LABEL: fcmp_ole
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, ls
+  %1 = fcmp ole float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_one(float %a, float %b) {
+; CHECK-LABEL: fcmp_one
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], mi
+; CHECK-NEXT:  csinc {{w[0-9]+}}, [[REG]], wzr, le
+  %1 = fcmp one float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_ord(float %a, float %b) {
+; CHECK-LABEL: fcmp_ord
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, vc
+  %1 = fcmp ord float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_uno(float %a, float %b) {
+; CHECK-LABEL: fcmp_uno
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, vs
+  %1 = fcmp uno float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_ueq(float %a, float %b) {
+; CHECK-LABEL: fcmp_ueq
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], eq
+; CHECK-NEXT:  csinc {{w[0-9]+}}, [[REG]], wzr, vc
+  %1 = fcmp ueq float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_ugt(float %a, float %b) {
+; CHECK-LABEL: fcmp_ugt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, hi
+  %1 = fcmp ugt float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_uge(float %a, float %b) {
+; CHECK-LABEL: fcmp_uge
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, pl
+  %1 = fcmp uge float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_ult(float %a, float %b) {
+; CHECK-LABEL: fcmp_ult
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, lt
+  %1 = fcmp ult float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_ule(float %a, float %b) {
+; CHECK-LABEL: fcmp_ule
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, le
+  %1 = fcmp ule float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_une(float %a, float %b) {
+; CHECK-LABEL: fcmp_une
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, ne
+  %1 = fcmp une float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_true(float %a) {
+; CHECK-LABEL: fcmp_true
+; CHECK:       orr {{w[0-9]+}}, wzr, #0x1
+  %1 = fcmp ueq float %a, %a
+  ret i1 %1
 }
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-gv.ll b/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
index dc4d895..1a4e8ea 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 ; Test load/store of global value from global offset table.
 @seed = common global i64 0, align 8
@@ -6,9 +6,9 @@
 define void @Initrand() nounwind {
 entry:
 ; CHECK: @Initrand
-; CHECK: adrp x[[REG:[0-9]+]], _seed@GOTPAGE
-; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]], _seed@GOTPAGEOFF]
-; CHECK: str x{{[0-9]+}}, [x[[REG2]]]
+; CHECK: adrp [[REG:x[0-9]+]], _seed@GOTPAGE
+; CHECK: ldr  [[REG2:x[0-9]+]], {{\[}}[[REG]], _seed@GOTPAGEOFF{{\]}}
+; CHECK: str  {{x[0-9]+}}, {{\[}}[[REG2]]{{\]}}
   store i64 74755, i64* @seed, align 8
   ret void
 }
@@ -16,17 +16,16 @@ entry:
 define i32 @Rand() nounwind {
 entry:
 ; CHECK: @Rand
-; CHECK: adrp x[[REG:[0-9]+]], _seed@GOTPAGE
-; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]], _seed@GOTPAGEOFF]
-; CHECK: movz x[[REG3:[0-9]+]], #0x51d
-; CHECK: ldr x[[REG4:[0-9]+]], [x[[REG2]]]
-; CHECK: mul x[[REG5:[0-9]+]], x[[REG4]], x[[REG3]]
-; CHECK: movz x[[REG6:[0-9]+]], #0x3619
-; CHECK: add x[[REG7:[0-9]+]], x[[REG5]], x[[REG6]]
-; CHECK: orr x[[REG8:[0-9]+]], xzr, #0xffff
-; CHECK: and x[[REG9:[0-9]+]], x[[REG7]], x[[REG8]]
-; CHECK: str x[[REG9]], [x[[REG]]]
-; CHECK: ldr x{{[0-9]+}}, [x[[REG]]]
+; CHECK: adrp [[REG1:x[0-9]+]], _seed@GOTPAGE
+; CHECK: ldr  [[REG2:x[0-9]+]], {{\[}}[[REG1]], _seed@GOTPAGEOFF{{\]}}
+; CHECK: movz [[REG3:x[0-9]+]], #0x3619
+; CHECK: movz [[REG4:x[0-9]+]], #0x51d
+; CHECK: ldr  [[REG5:x[0-9]+]], {{\[}}[[REG2]]{{\]}}
+; CHECK: mul  [[REG6:x[0-9]+]], [[REG5]], [[REG4]]
+; CHECK: add  [[REG7:x[0-9]+]], [[REG6]], [[REG3]]
+; CHECK: and  [[REG8:x[0-9]+]], [[REG7]], #0xffff
+; CHECK: str  [[REG8]], {{\[}}[[REG1]]{{\]}}
+; CHECK: ldr  {{x[0-9]+}}, {{\[}}[[REG1]]{{\]}}
   %0 = load i64* @seed, align 8
   %mul = mul nsw i64 %0, 1309
   %add = add nsw i64 %mul, 13849
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
index 971be5c..245c70e 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 define i32 @icmp_eq_imm(i32 %a) nounwind ssp {
 entry:
-; CHECK: icmp_eq_imm
-; CHECK: cmp  w0, #31
-; CHECK: cset w0, eq
+; CHECK-LABEL: icmp_eq_imm
+; CHECK:       cmp w0, #31
+; CHECK-NEXT:  cset w0, eq
   %cmp = icmp eq i32 %a, 31
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -12,19 +12,19 @@ entry:
 
 define i32 @icmp_eq_neg_imm(i32 %a) nounwind ssp {
 entry:
-; CHECK: icmp_eq_neg_imm
-; CHECK: cmn  w0, #7
-; CHECK: cset w0, eq
+; CHECK-LABEL: icmp_eq_neg_imm
+; CHECK:       cmn w0, #7
+; CHECK-NEXT:  cset w0, eq
   %cmp = icmp eq i32 %a, -7
   %conv = zext i1 %cmp to i32
   ret i32 %conv
 }
 
-define i32 @icmp_eq(i32 %a, i32 %b) nounwind ssp {
+define i32 @icmp_eq_i32(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_eq
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, eq
+; CHECK-LABEL: icmp_eq_i32
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, eq
   %cmp = icmp eq i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -32,19 +32,39 @@ entry:
 
 define i32 @icmp_ne(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_ne
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, ne
+; CHECK-LABEL: icmp_ne
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, ne
   %cmp = icmp ne i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
 }
 
+define i32 @icmp_eq_ptr(i8* %a) {
+entry:
+; CHECK-LABEL: icmp_eq_ptr
+; CHECK:       cmp x0, #0
+; CHECK-NEXT:  cset {{.+}}, eq
+  %cmp = icmp eq i8* %a, null
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @icmp_ne_ptr(i8* %a) {
+entry:
+; CHECK-LABEL: icmp_ne_ptr
+; CHECK:       cmp x0, #0
+; CHECK-NEXT:  cset {{.+}}, ne
+  %cmp = icmp ne i8* %a, null
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
 define i32 @icmp_ugt(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_ugt
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, hi
+; CHECK-LABEL: icmp_ugt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, hi
   %cmp = icmp ugt i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -52,9 +72,9 @@ entry:
 
 define i32 @icmp_uge(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_uge
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, hs
+; CHECK-LABEL: icmp_uge
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, hs
   %cmp = icmp uge i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -62,9 +82,9 @@ entry:
 
 define i32 @icmp_ult(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_ult
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, lo
+; CHECK-LABEL: icmp_ult
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, lo
   %cmp = icmp ult i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -72,9 +92,9 @@ entry:
 
 define i32 @icmp_ule(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_ule
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, ls
+; CHECK-LABEL: icmp_ule
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, ls
   %cmp = icmp ule i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -82,9 +102,9 @@ entry:
 
 define i32 @icmp_sgt(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_sgt
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, gt
+; CHECK-LABEL: icmp_sgt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, gt
   %cmp = icmp sgt i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -92,9 +112,9 @@ entry:
 
 define i32 @icmp_sge(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_sge
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, ge
+; CHECK-LABEL: icmp_sge
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, ge
   %cmp = icmp sge i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -102,9 +122,9 @@ entry:
 
 define i32 @icmp_slt(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_slt
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, lt
+; CHECK-LABEL: icmp_slt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, lt
   %cmp = icmp slt i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -112,9 +132,9 @@ entry:
 
 define i32 @icmp_sle(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_sle
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, le
+; CHECK-LABEL: icmp_sle
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, le
   %cmp = icmp sle i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -122,9 +142,9 @@ entry:
 
 define i32 @icmp_i64(i64 %a, i64 %b) nounwind ssp {
 entry:
-; CHECK: icmp_i64
-; CHECK: cmp  x0, x1
-; CHECK: cset w{{[0-9]+}}, le
+; CHECK-LABEL: icmp_i64
+; CHECK:       cmp  x0, x1
+; CHECK-NEXT:  cset w{{[0-9]+}}, le
   %cmp = icmp sle i64 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -132,33 +152,30 @@ entry:
 
 define zeroext i1 @icmp_eq_i16(i16 %a, i16 %b) nounwind ssp {
 entry:
-; CHECK: icmp_eq_i16
-; CHECK: sxth w0, w0
-; CHECK: sxth w1, w1
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, eq
+; CHECK-LABEL: icmp_eq_i16
+; CHECK:       sxth w0, w0
+; CHECK:       cmp w0, w1, sxth
+; CHECK-NEXT:  cset w0, eq
   %cmp = icmp eq i16 %a, %b
   ret i1 %cmp
 }
 
 define zeroext i1 @icmp_eq_i8(i8 %a, i8 %b) nounwind ssp {
 entry:
-; CHECK: icmp_eq_i8
-; CHECK: sxtb w0, w0
-; CHECK: sxtb w1, w1
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, eq
+; CHECK-LABEL: icmp_eq_i8
+; CHECK:       sxtb w0, w0
+; CHECK-NEXT:  cmp w0, w1, sxtb
+; CHECK-NEXT:  cset w0, eq
   %cmp = icmp eq i8 %a, %b
   ret i1 %cmp
 }
 
 define i32 @icmp_i16_unsigned(i16 %a, i16 %b) nounwind {
 entry:
-; CHECK: icmp_i16_unsigned
-; CHECK: uxth w0, w0
-; CHECK: uxth w1, w1
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, lo
+; CHECK-LABEL: icmp_i16_unsigned
+; CHECK:       uxth w0, w0
+; CHECK-NEXT:  cmp w0, w1, uxth
+; CHECK-NEXT:  cset w0, lo
   %cmp = icmp ult i16 %a, %b
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
@@ -166,24 +183,34 @@ entry:
 
 define i32 @icmp_i8_signed(i8 %a, i8 %b) nounwind {
 entry:
-; CHECK: @icmp_i8_signed
-; CHECK: sxtb w0, w0
-; CHECK: sxtb w1, w1
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, gt
+; CHECK-LABEL: icmp_i8_signed
+; CHECK:       sxtb w0, w0
+; CHECK-NEXT:  cmp w0, w1, sxtb
+; CHECK-NEXT:  cset w0, gt
   %cmp = icmp sgt i8 %a, %b
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
 }
 
+define i32 @icmp_i1_signed(i1 %a, i1 %b) nounwind {
+entry:
+; CHECK-LABEL: icmp_i1_signed
+; CHECK:       sbfx [[REG1:w[0-9]+]], w0, #0, #1
+; CHECK-NEXT:  sbfx [[REG2:w[0-9]+]], w1, #0, #1
+; CHECK-NEXT:  cmp  [[REG1]], [[REG2]]
+; CHECK-NEXT:  cset w0, gt
+  %cmp = icmp sgt i1 %a, %b
+  %conv2 = zext i1 %cmp to i32
+  ret i32 %conv2
+}
 
 define i32 @icmp_i16_signed_const(i16 %a) nounwind {
 entry:
-; CHECK: icmp_i16_signed_const
-; CHECK: sxth w0, w0
-; CHECK: cmn  w0, #233
-; CHECK: cset w0, lt
-; CHECK: and w0, w0, #0x1
+; CHECK-LABEL: icmp_i16_signed_const
+; CHECK:       sxth w0, w0
+; CHECK-NEXT:  cmn w0, #233
+; CHECK-NEXT:  cset w0, lt
+; CHECK-NEXT:  and w0, w0, #0x1
   %cmp = icmp slt i16 %a, -233
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
@@ -191,11 +218,11 @@ entry:
 
 define i32 @icmp_i8_signed_const(i8 %a) nounwind {
 entry:
-; CHECK: icmp_i8_signed_const
-; CHECK: sxtb w0, w0
-; CHECK: cmp  w0, #124
-; CHECK: cset w0, gt
-; CHECK: and w0, w0, #0x1
+; CHECK-LABEL: icmp_i8_signed_const
+; CHECK:       sxtb w0, w0
+; CHECK-NEXT:  cmp w0, #124
+; CHECK-NEXT:  cset w0, gt
+; CHECK-NEXT:  and w0, w0, #0x1
   %cmp = icmp sgt i8 %a, 124
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
@@ -203,11 +230,11 @@ entry:
 
 define i32 @icmp_i1_unsigned_const(i1 %a) nounwind {
 entry:
-; CHECK: icmp_i1_unsigned_const
-; CHECK: and w0, w0, #0x1
-; CHECK: cmp  w0, #0
-; CHECK: cset w0, lo
-; CHECK: and w0, w0, #0x1
+; CHECK-LABEL: icmp_i1_unsigned_const
+; CHECK:       and w0, w0, #0x1
+; CHECK-NEXT:  cmp w0, #0
+; CHECK-NEXT:  cset w0, lo
+; CHECK-NEXT:  and w0, w0, #0x1
   %cmp = icmp ult i1 %a, 0
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll b/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll
index 70335ac..a5f4524 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 @fn.table = internal global [2 x i8*] [i8* blockaddress(@fn, %ZERO), i8* blockaddress(@fn, %ONE)], align 8
 
 define i32 @fn(i32 %target) nounwind {
 entry:
-; CHECK: @fn
+; CHECK-LABEL: fn
   %retval = alloca i32, align 4
   %target.addr = alloca i32, align 4
   store i32 %target, i32* %target.addr, align 4
@@ -29,8 +29,8 @@ return:                                           ; preds = %ONE, %ZERO
   ret i32 %2
 
 indirectgoto:                                     ; preds = %entry
-; CHECK: ldr x0, [sp]
-; CHECK: br x0
+; CHECK:      ldr [[REG:x[0-9]+]], [sp]
+; CHECK-NEXT: br [[REG]]
   %indirect.goto.dest = phi i8* [ %1, %entry ]
   indirectbr i8* %indirect.goto.dest, [label %ZERO, label %ONE]
 }
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
index 1152988..9ac3e44 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=arm64-apple-ios | FileCheck %s --check-prefix=ARM64
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -relocation-model=dynamic-no-pic -mtriple=arm64-apple-ios < %s | FileCheck %s --check-prefix=ARM64
 
 @message = global [80 x i8] c"The LLVM Compiler Infrastructure\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", align 16
 @temp = common global [80 x i8] zeroinitializer, align 16
@@ -7,7 +7,7 @@ define void @t1() {
 ; ARM64-LABEL: t1
 ; ARM64: adrp x8, _message@PAGE
 ; ARM64: add x0, x8, _message@PAGEOFF
-; ARM64: movz w9, #0
+; ARM64: mov w9, wzr
 ; ARM64: movz x2, #0x50
 ; ARM64: uxtb w1, w9
 ; ARM64: bl _memset
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll b/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll
index ffac131..1dea5d9 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll
@@ -1,27 +1,41 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 ; Materialize using fmov
-define void @float_(float* %value) {
-; CHECK: @float_
-; CHECK: fmov s0, #1.25000000
-  store float 1.250000e+00, float* %value, align 4
-  ret void
+define float @fmov_float1() {
+; CHECK-LABEL: fmov_float1
+; CHECK:       fmov s0, #1.25000000
+  ret float 1.250000e+00
 }
 
-define void @double_(double* %value) {
-; CHECK: @double_
-; CHECK: fmov d0, #1.25000000
-  store double 1.250000e+00, double* %value, align 8
-  ret void
+define float @fmov_float2() {
+; CHECK-LABEL: fmov_float2
+; CHECK:       fmov s0, wzr
+  ret float 0.0e+00
+}
+
+define double @fmov_double1() {
+; CHECK-LABEL: fmov_double1
+; CHECK:       fmov d0, #1.25000000
+  ret double 1.250000e+00
+}
+
+define double @fmov_double2() {
+; CHECK-LABEL: fmov_double2
+; CHECK:       fmov d0, xzr
+  ret double 0.0e+00
 }
 
 ; Materialize from constant pool
-define float @float_cp() {
-; CHECK: @float_cp
+define float @cp_float() {
+; CHECK-LABEL: cp_float
+; CHECK:       adrp [[REG:x[0-9]+]], {{lCPI[0-9]+_0}}@PAGE
+; CHECK-NEXT:  ldr s0, {{\[}}[[REG]], {{lCPI[0-9]+_0}}@PAGEOFF{{\]}}
   ret float 0x400921FB60000000
 }
 
-define double @double_cp() {
-; CHECK: @double_cp
+define double @cp_double() {
+; CHECK-LABEL: cp_double
+; CHECK:       adrp [[REG:x[0-9]+]], {{lCPI[0-9]+_0}}@PAGE
+; CHECK-NEXT:  ldr d0, {{\[}}[[REG]], {{lCPI[0-9]+_0}}@PAGEOFF{{\]}}
   ret double 0x400921FB54442D18
 }
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll b/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll
index 483d179..81daa7c 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-apple-ios -O0 %s -o - | FileCheck %s
+; RUN: llc -O0 -verify-machineinstrs -mtriple=aarch64-apple-ios < %s | FileCheck %s
 
 ; Fast-isel can't do vector conversions yet, but it was emitting some highly
 ; suspect UCVTFUWDri MachineInstrs.
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-rem.ll b/test/CodeGen/AArch64/arm64-fast-isel-rem.ll
index d5bdbaa..26f9afa 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-rem.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-rem.ll
@@ -1,7 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 ; RUN: llc %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -print-machineinstrs=expand-isel-pseudos -o /dev/null 2> %t
 ; RUN: FileCheck %s < %t --check-prefix=CHECK-SSA
-; REQUIRES: asserts
 
 ; CHECK-SSA-LABEL: Machine code for function t1
 
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-ret.ll b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
index d91fd28..f84c755 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 ;; Test returns.
 define void @t0() nounwind ssp {
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-select.ll b/test/CodeGen/AArch64/arm64-fast-isel-select.ll
deleted file mode 100644
index 1cc207f..0000000
--- a/test/CodeGen/AArch64/arm64-fast-isel-select.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
-
-define i32 @t1(i32 %c) nounwind readnone {
-entry:
-; CHECK: @t1
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: csel w0, w{{[0-9]+}}, w{{[0-9]+}}, ne
-  %0 = icmp sgt i32 %c, 1
-  %1 = select i1 %0, i32 123, i32 357
-  ret i32 %1
-}
-
-define i64 @t2(i32 %c) nounwind readnone {
-entry:
-; CHECK: @t2
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: csel x0, x{{[0-9]+}}, x{{[0-9]+}}, ne
-  %0 = icmp sgt i32 %c, 1
-  %1 = select i1 %0, i64 123, i64 357
-  ret i64 %1
-}
-
-define i32 @t3(i1 %c, i32 %a, i32 %b) nounwind readnone {
-entry:
-; CHECK: @t3
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: csel w0, w{{[0-9]+}}, w{{[0-9]+}}, ne
-  %0 = select i1 %c, i32 %a, i32 %b
-  ret i32 %0
-}
-
-define i64 @t4(i1 %c, i64 %a, i64 %b) nounwind readnone {
-entry:
-; CHECK: @t4
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: csel x0, x{{[0-9]+}}, x{{[0-9]+}}, ne
-  %0 = select i1 %c, i64 %a, i64 %b
-  ret i64 %0
-}
-
-define float @t5(i1 %c, float %a, float %b) nounwind readnone {
-entry:
-; CHECK: @t5
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: fcsel s0, s0, s1, ne
-  %0 = select i1 %c, float %a, float %b
-  ret float %0
-}
-
-define double @t6(i1 %c, double %a, double %b) nounwind readnone {
-entry:
-; CHECK: @t6
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: fcsel d0, d0, d1, ne
-  %0 = select i1 %c, double %a, double %b
-  ret double %0
-}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-store.ll b/test/CodeGen/AArch64/arm64-fast-isel-store.ll
new file mode 100644
index 0000000..9494d55
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-store.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mtriple=aarch64-unknown-unknown                             -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-unknown-unknown -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+define void @store_i8(i8* %a) {
+; CHECK-LABEL: store_i8
+; CHECK: strb  wzr, [x0]
+  store i8 0, i8* %a
+  ret void
+}
+
+define void @store_i16(i16* %a) {
+; CHECK-LABEL: store_i16
+; CHECK: strh  wzr, [x0]
+  store i16 0, i16* %a
+  ret void
+}
+
+define void @store_i32(i32* %a) {
+; CHECK-LABEL: store_i32
+; CHECK: str  wzr, [x0]
+  store i32 0, i32* %a
+  ret void
+}
+
+define void @store_i64(i64* %a) {
+; CHECK-LABEL: store_i64
+; CHECK: str  xzr, [x0]
+  store i64 0, i64* %a
+  ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel.ll b/test/CodeGen/AArch64/arm64-fast-isel.ll
index 0194b3a..4349946 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 define void @t0(i32 %a) nounwind {
 entry:
@@ -66,8 +66,7 @@ entry:
 define void @t4(i32 *%ptr) nounwind {
 entry:
 ; CHECK-LABEL: t4:
-; CHECK: movz w8, #0
-; CHECK: stur w8, [x0, #-4]
+; CHECK: stur wzr, [x0, #-4]
 ; CHECK: ret
   %0 = getelementptr i32 *%ptr, i32 -1
   store i32 0, i32* %0, align 4
@@ -77,8 +76,7 @@ entry:
 define void @t5(i32 *%ptr) nounwind {
 entry:
 ; CHECK-LABEL: t5:
-; CHECK: movz w8, #0
-; CHECK: stur w8, [x0, #-256]
+; CHECK: stur wzr, [x0, #-256]
 ; CHECK: ret
   %0 = getelementptr i32 *%ptr, i32 -64
   store i32 0, i32* %0, align 4
diff --git a/test/CodeGen/AArch64/arm64-frameaddr.ll b/test/CodeGen/AArch64/arm64-frameaddr.ll
deleted file mode 100644
index 469078c..0000000
--- a/test/CodeGen/AArch64/arm64-frameaddr.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: llc < %s -march=arm64 | FileCheck %s
-
-define i8* @t() nounwind {
-entry:
-; CHECK-LABEL: t:
-; CHECK: stp x29, x30, [sp, #-16]!
-; CHECK: mov x29, sp
-; CHECK: mov x0, x29
-; CHECK: ldp x29, x30, [sp], #16
-; CHECK: ret
-	%0 = call i8* @llvm.frameaddress(i32 0)
-        ret i8* %0
-}
-
-declare i8* @llvm.frameaddress(i32) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-indexed-memory.ll b/test/CodeGen/AArch64/arm64-indexed-memory.ll
index e501c6e..a8620f4 100644
--- a/test/CodeGen/AArch64/arm64-indexed-memory.ll
+++ b/test/CodeGen/AArch64/arm64-indexed-memory.ll
@@ -349,3 +349,15 @@ define i8* @preidx8sext64(i8* %src, i64* %out) {
   store i64 %ext, i64* %out, align 4
   ret i8* %ptr
 }
+
+; This test checks if illegal post-index is generated
+
+define i64* @postidx_clobber(i64* %addr) nounwind noinline ssp {
+; CHECK-LABEL: postidx_clobber:
+; CHECK-NOT: str     x0, [x0], #8
+; ret
+ %paddr = bitcast i64* %addr to i64**
+ store i64* %addr, i64** %paddr
+ %newaddr = getelementptr i64* %addr, i32 1
+ ret i64* %newaddr
+}
diff --git a/test/CodeGen/AArch64/arm64-inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll
index d76cca3..9c8bcaa 100644
--- a/test/CodeGen/AArch64/arm64-inline-asm.ll
+++ b/test/CodeGen/AArch64/arm64-inline-asm.ll
@@ -87,13 +87,17 @@ entry:
   ret i32 %1
 }
 
-define i32 @constraint_J(i32 %i, i32 %j) nounwind {
+define i32 @constraint_J(i32 %i, i32 %j, i64 %k) nounwind {
 entry:
   ; CHECK-LABEL: constraint_J:
   %0 = tail call i32 asm sideeffect "sub ${0:w}, ${1:w}, $2", "=r,r,J"(i32 %i, i32 -16773120) nounwind
-  ; CHECK: sub   {{w[0-9]+}}, {{w[0-9]+}}, #4278194176
+  ; CHECK: sub   {{w[0-9]+}}, {{w[0-9]+}}, #-16773120
   %1 = tail call i32 asm sideeffect "sub ${0:w}, ${1:w}, $2", "=r,r,J"(i32 %i, i32 -1) nounwind
-  ; CHECK: sub   {{w[0-9]+}}, {{w[0-9]+}}, #4294967295
+  ; CHECK: sub   {{w[0-9]+}}, {{w[0-9]+}}, #-1
+  %2 = tail call i64 asm sideeffect "sub ${0:x}, ${1:x}, $2", "=r,r,J"(i64 %k, i32 -1) nounwind
+  ; CHECK: sub   {{x[0-9]+}}, {{x[0-9]+}}, #-1
+  %3 = tail call i64 asm sideeffect "sub ${0:x}, ${1:x}, $2", "=r,r,J"(i64 %k, i64 -1) nounwind
+  ; CHECK: sub   {{x[0-9]+}}, {{x[0-9]+}}, #-1
   ret i32 %1
 }
 
diff --git a/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll b/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll
new file mode 100644
index 0000000..d39722b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone < %s | FileCheck %s
+
+; Test that scratch registers are spilled around patchpoints
+; CHECK: InlineAsm End
+; CHECK-NEXT: mov x{{[0-9]+}}, x16
+; CHECK-NEXT: mov x{{[0-9]+}}, x17
+; CHECK-NEXT: Ltmp
+; CHECK-NEXT: nop
+define void @clobberScratch(i32* %p) {
+  %v = load i32* %p
+  tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
+  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v)
+  store i32 %v, i32* %p
+  ret void
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+
diff --git a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
new file mode 100644
index 0000000..8f79f80
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
@@ -0,0 +1,118 @@
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone            < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone -fast-isel < %s | FileCheck %s --check-prefix=FAST
+
+; One argument will be passed in register, the other will be pushed on the stack.
+; Return value in x0.
+define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen:
+; CHECK:       Ltmp
+; CHECK:       str x{{.+}}, [sp]
+; CHECK-NEXT:  mov  x0, x{{.+}}
+; CHECK:       Ltmp
+; CHECK-NEXT:  movz  x16, #0xffff, lsl #32
+; CHECK-NEXT:  movk  x16, #0xdead, lsl #16
+; CHECK-NEXT:  movk  x16, #0xbeef
+; CHECK-NEXT:  blr x16
+; FAST-LABEL:  jscall_patchpoint_codegen:
+; FAST:        Ltmp
+; FAST:        str x{{.+}}, [sp]
+; FAST:        Ltmp
+; FAST-NEXT:   movz  x16, #0xffff, lsl #32
+; FAST-NEXT:   movk  x16, #0xdead, lsl #16
+; FAST-NEXT:   movk  x16, #0xbeef
+; FAST-NEXT:   blr x16
+  %resolveCall2 = inttoptr i64 281474417671919 to i8*
+  %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
+  %resolveCall3 = inttoptr i64 244837814038255 to i8*
+  tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
+  ret void
+}
+
+; Test if the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen2(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen2:
+; CHECK:       Ltmp
+; CHECK:       orr w[[REG:[0-9]+]], wzr, #0x6
+; CHECK-NEXT:  str x[[REG]], [sp, #24]
+; CHECK-NEXT:  orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK-NEXT:  str w[[REG]], [sp, #16]
+; CHECK-NEXT:  orr w[[REG:[0-9]+]], wzr, #0x2
+; CHECK-NEXT:  str x[[REG]], [sp]
+; CHECK:       Ltmp
+; CHECK-NEXT:  movz  x16, #0xffff, lsl #32
+; CHECK-NEXT:  movk  x16, #0xdead, lsl #16
+; CHECK-NEXT:  movk  x16, #0xbeef
+; CHECK-NEXT:  blr x16
+; FAST-LABEL:  jscall_patchpoint_codegen2:
+; FAST:        Ltmp
+; FAST:        orr [[REG1:x[0-9]+]], xzr, #0x2
+; FAST-NEXT:   orr [[REG2:w[0-9]+]], wzr, #0x4
+; FAST-NEXT:   orr [[REG3:x[0-9]+]], xzr, #0x6
+; FAST-NEXT:   str [[REG1]], [sp]
+; FAST-NEXT:   str [[REG2]], [sp, #16]
+; FAST-NEXT:   str [[REG3]], [sp, #24]
+; FAST:        Ltmp
+; FAST-NEXT:   movz  x16, #0xffff, lsl #32
+; FAST-NEXT:   movk  x16, #0xdead, lsl #16
+; FAST-NEXT:   movk  x16, #0xbeef
+; FAST-NEXT:   blr x16
+  %call = inttoptr i64 281474417671919 to i8*
+  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
+  ret i64 %result
+}
+
+; Test if the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen3(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen3:
+; CHECK:       Ltmp
+; CHECK:       movz  w[[REG:[0-9]+]], #0xa
+; CHECK-NEXT:  str x[[REG]], [sp, #48]
+; CHECK-NEXT:  orr w[[REG:[0-9]+]], wzr, #0x8
+; CHECK-NEXT:  str w[[REG]], [sp, #36]
+; CHECK-NEXT:  orr w[[REG:[0-9]+]], wzr, #0x6
+; CHECK-NEXT:  str x[[REG]], [sp, #24]
+; CHECK-NEXT:  orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK-NEXT:  str w[[REG]], [sp, #16]
+; CHECK-NEXT:  orr w[[REG:[0-9]+]], wzr, #0x2
+; CHECK-NEXT:  str x[[REG]], [sp]
+; CHECK:       Ltmp
+; CHECK-NEXT:  movz  x16, #0xffff, lsl #32
+; CHECK-NEXT:  movk  x16, #0xdead, lsl #16
+; CHECK-NEXT:  movk  x16, #0xbeef
+; CHECK-NEXT:  blr x16
+; FAST-LABEL:  jscall_patchpoint_codegen3:
+; FAST:        Ltmp
+; FAST:        orr [[REG1:x[0-9]+]], xzr, #0x2
+; FAST-NEXT:   orr [[REG2:w[0-9]+]], wzr, #0x4
+; FAST-NEXT:   orr [[REG3:x[0-9]+]], xzr, #0x6
+; FAST-NEXT:   orr [[REG4:w[0-9]+]], wzr, #0x8
+; FAST-NEXT:   movz [[REG5:x[0-9]+]], #0xa
+; FAST-NEXT:   str [[REG1]], [sp]
+; FAST-NEXT:   str [[REG2]], [sp, #16]
+; FAST-NEXT:   str [[REG3]], [sp, #24]
+; FAST-NEXT:   str [[REG4]], [sp, #36]
+; FAST-NEXT:   str [[REG5]], [sp, #48]
+; FAST:        Ltmp
+; FAST-NEXT:   movz  x16, #0xffff, lsl #32
+; FAST-NEXT:   movk  x16, #0xdead, lsl #16
+; FAST-NEXT:   movk  x16, #0xbeef
+; FAST-NEXT:   blr x16
+  %call = inttoptr i64 281474417671919 to i8*
+  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
+  ret i64 %result
+}
+
+; CHECK-LABEL: test_i16:
+; CHECK: ldrh [[BREG:w[0-9]+]], [sp]
+; CHECK: add {{w[0-9]+}}, w0, [[BREG]]
+define webkit_jscc zeroext i16 @test_i16(i16 zeroext %a, i16 zeroext %b) {
+  %sum = add i16 %a, %b
+  ret i16 %sum
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
+
diff --git a/test/CodeGen/AArch64/arm64-patchpoint.ll b/test/CodeGen/AArch64/arm64-patchpoint.ll
index 039cdfc..278cba5 100644
--- a/test/CodeGen/AArch64/arm64-patchpoint.ll
+++ b/test/CodeGen/AArch64/arm64-patchpoint.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone                             < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone -fast-isel -fast-isel-abort < %s | FileCheck %s
 
 ; Trivial patchpoint codegen
 ;
@@ -41,73 +42,6 @@ entry:
   ret void
 }
 
-; Test the webkit_jscc calling convention.
-; One argument will be passed in register, the other will be pushed on the stack.
-; Return value in x0.
-define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen:
-; CHECK:      Ltmp
-; CHECK:      str x{{.+}}, [sp]
-; CHECK-NEXT: mov  x0, x{{.+}}
-; CHECK:      Ltmp
-; CHECK-NEXT: movz  x16, #0xffff, lsl #32
-; CHECK-NEXT: movk  x16, #0xdead, lsl #16
-; CHECK-NEXT: movk  x16, #0xbeef
-; CHECK-NEXT: blr x16
-  %resolveCall2 = inttoptr i64 281474417671919 to i8*
-  %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
-  %resolveCall3 = inttoptr i64 244837814038255 to i8*
-  tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
-  ret void
-}
-
-; Test if the arguments are properly aligned and that we don't store undef arguments.
-define i64 @jscall_patchpoint_codegen2(i64 %callee) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen2:
-; CHECK:      Ltmp
-; CHECK:      orr w{{.+}}, wzr, #0x6
-; CHECK-NEXT: str x{{.+}}, [sp, #24]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x4
-; CHECK-NEXT: str w{{.+}}, [sp, #16]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x2
-; CHECK-NEXT: str x{{.+}}, [sp]
-; CHECK:      Ltmp
-; CHECK-NEXT: movz  x16, #0xffff, lsl #32
-; CHECK-NEXT: movk  x16, #0xdead, lsl #16
-; CHECK-NEXT: movk  x16, #0xbeef
-; CHECK-NEXT: blr x16
-  %call = inttoptr i64 281474417671919 to i8*
-  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
-  ret i64 %result
-}
-
-; Test if the arguments are properly aligned and that we don't store undef arguments.
-define i64 @jscall_patchpoint_codegen3(i64 %callee) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen3:
-; CHECK:      Ltmp
-; CHECK:      movz  w{{.+}}, #0xa
-; CHECK-NEXT: str x{{.+}}, [sp, #48]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x8
-; CHECK-NEXT: str w{{.+}}, [sp, #36]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x6
-; CHECK-NEXT: str x{{.+}}, [sp, #24]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x4
-; CHECK-NEXT: str w{{.+}}, [sp, #16]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x2
-; CHECK-NEXT: str x{{.+}}, [sp]
-; CHECK:      Ltmp
-; CHECK-NEXT: movz  x16, #0xffff, lsl #32
-; CHECK-NEXT: movk  x16, #0xdead, lsl #16
-; CHECK-NEXT: movk  x16, #0xbeef
-; CHECK-NEXT: blr x16
-  %call = inttoptr i64 281474417671919 to i8*
-  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
-  ret i64 %result
-}
-
 ; Test patchpoints reusing the same TargetConstant.
 ; <rdar:15390785> Assertion failed: (CI.getNumArgOperands() >= NumArgs + 4)
 ; There is no way to verify this, since it depends on memory allocation.
@@ -144,28 +78,7 @@ entry:
   ret void
 }
 
-; Test that scratch registers are spilled around patchpoints
-; CHECK: InlineAsm End
-; CHECK-NEXT: mov x{{[0-9]+}}, x16
-; CHECK-NEXT: mov x{{[0-9]+}}, x17
-; CHECK-NEXT: Ltmp
-; CHECK-NEXT: nop
-define void @clobberScratch(i32* %p) {
-  %v = load i32* %p
-  tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v)
-  store i32 %v, i32* %p
-  ret void
-}
-
 declare void @llvm.experimental.stackmap(i64, i32, ...)
 declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
 declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
 
-; CHECK-LABEL: test_i16:
-; CHECK: ldrh [[BREG:w[0-9]+]], [sp]
-; CHECK: add w0, w0, [[BREG]]
-define webkit_jscc i16 @test_i16(i16 zeroext %a, i16 zeroext %b) {
-  %sum = add i16 %a, %b
-  ret i16 %sum
-}
diff --git a/test/CodeGen/AArch64/arm64-popcnt.ll b/test/CodeGen/AArch64/arm64-popcnt.ll
index 2afade2..117ab3a 100644
--- a/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=aarch64 -mattr -neon -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-NONEON %s
 
 define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
   %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -8,6 +9,13 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
 ; CHECK: uaddlv.8b	h0, v0
 ; CHECK: fmov w0, s0
 ; CHECK: ret
+; CHECK-NONEON-LABEL: cnt32_advsimd
+; CHECK-NONEON-NOT: 8b
+; CHECK-NONEON: and w{{[0-9]+}}, w{{[0-9]+}}, #0x55555555
+; CHECK-NONEON: and w{{[0-9]+}}, w{{[0-9]+}}, #0x33333333
+; CHECK-NONEON: and w{{[0-9]+}}, w{{[0-9]+}}, #0xf0f0f0f
+; CHECK-NONEON: mul
+
 }
 
 define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
@@ -18,6 +26,12 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
 ; CHECK: uaddlv.8b	h0, v0
 ; CHECK: fmov	w0, s0
 ; CHECK: ret
+; CHECK-NONEON-LABEL: cnt64_advsimd
+; CHECK-NONEON-NOT: 8b
+; CHECK-NONEON: and x{{[0-9]+}}, x{{[0-9]+}}, #0x5555555555555555
+; CHECK-NONEON: and x{{[0-9]+}}, x{{[0-9]+}}, #0x3333333333333333
+; CHECK-NONEON: and x{{[0-9]+}}, x{{[0-9]+}}, #0xf0f0f0f0f0f0f0f
+; CHECK-NONEON: mul
 }
 
 ; Do not use AdvSIMD when -mno-implicit-float is specified.
diff --git a/test/CodeGen/AArch64/arm64-prefetch.ll b/test/CodeGen/AArch64/arm64-prefetch.ll
index b2e06ed..9dc6301 100644
--- a/test/CodeGen/AArch64/arm64-prefetch.ll
+++ b/test/CodeGen/AArch64/arm64-prefetch.ll
@@ -17,6 +17,15 @@ entry:
   ; CHECK: prfum pldl1keep
   call void @llvm.prefetch(i8* %tmp, i32 0, i32 3, i32 1)
 
+  ; CHECK: prfum plil1strm
+  call void @llvm.prefetch(i8* %tmp, i32 0, i32 0, i32 0)
+  ; CHECK: prfum plil3keep
+  call void @llvm.prefetch(i8* %tmp, i32 0, i32 1, i32 0)
+  ; CHECK: prfum plil2keep
+  call void @llvm.prefetch(i8* %tmp, i32 0, i32 2, i32 0)
+  ; CHECK: prfum plil1keep
+  call void @llvm.prefetch(i8* %tmp, i32 0, i32 3, i32 0)
+
   ; CHECK: prfum pstl1strm
   call void @llvm.prefetch(i8* %tmp, i32 1, i32 0, i32 1)
   ; CHECK: prfum pstl3keep
@@ -57,26 +66,52 @@ entry:
   %arrayidx12 = getelementptr inbounds i32* %tmp10, i64 %idxprom
   %tmp11 = bitcast i32* %arrayidx12 to i8*
 
-  ; CHECK: prfm pstl1strm
-  call void @llvm.prefetch(i8* %tmp11, i32 1, i32 0, i32 1)
+
+  ; CHECK: prfm plil1strm
+  call void @llvm.prefetch(i8* %tmp11, i32 0, i32 0, i32 0)
   %tmp12 = load i32** @a, align 8, !tbaa !3
   %arrayidx15 = getelementptr inbounds i32* %tmp12, i64 %idxprom
-  %tmp13 = bitcast i32* %arrayidx15 to i8*
+  %tmp13 = bitcast i32* %arrayidx3 to i8*
 
-  ; CHECK: prfm pstl3keep
-  call void @llvm.prefetch(i8* %tmp13, i32 1, i32 1, i32 1)
+  ; CHECK: prfm plil3keep
+  call void @llvm.prefetch(i8* %tmp13, i32 0, i32 1, i32 0)
   %tmp14 = load i32** @a, align 8, !tbaa !3
   %arrayidx18 = getelementptr inbounds i32* %tmp14, i64 %idxprom
-  %tmp15 = bitcast i32* %arrayidx18 to i8*
+  %tmp15 = bitcast i32* %arrayidx6 to i8*
 
-  ; CHECK: prfm pstl2keep
-  call void @llvm.prefetch(i8* %tmp15, i32 1, i32 2, i32 1)
+  ; CHECK: prfm plil2keep
+  call void @llvm.prefetch(i8* %tmp15, i32 0, i32 2, i32 0)
   %tmp16 = load i32** @a, align 8, !tbaa !3
   %arrayidx21 = getelementptr inbounds i32* %tmp16, i64 %idxprom
-  %tmp17 = bitcast i32* %arrayidx21 to i8*
+  %tmp17 = bitcast i32* %arrayidx9 to i8*
+
+  ; CHECK: prfm plil1keep
+  call void @llvm.prefetch(i8* %tmp17, i32 0, i32 3, i32 0)
+  %tmp18 = load i32** @a, align 8, !tbaa !3
+  %arrayidx24 = getelementptr inbounds i32* %tmp18, i64 %idxprom
+  %tmp19 = bitcast i32* %arrayidx12 to i8*
+
+
+  ; CHECK: prfm pstl1strm
+  call void @llvm.prefetch(i8* %tmp19, i32 1, i32 0, i32 1)
+  %tmp20 = load i32** @a, align 8, !tbaa !3
+  %arrayidx27 = getelementptr inbounds i32* %tmp20, i64 %idxprom
+  %tmp21 = bitcast i32* %arrayidx15 to i8*
+
+  ; CHECK: prfm pstl3keep
+  call void @llvm.prefetch(i8* %tmp21, i32 1, i32 1, i32 1)
+  %tmp22 = load i32** @a, align 8, !tbaa !3
+  %arrayidx30 = getelementptr inbounds i32* %tmp22, i64 %idxprom
+  %tmp23 = bitcast i32* %arrayidx18 to i8*
+
+  ; CHECK: prfm pstl2keep
+  call void @llvm.prefetch(i8* %tmp23, i32 1, i32 2, i32 1)
+  %tmp24 = load i32** @a, align 8, !tbaa !3
+  %arrayidx33 = getelementptr inbounds i32* %tmp24, i64 %idxprom
+  %tmp25 = bitcast i32* %arrayidx21 to i8*
 
   ; CHECK: prfm pstl1keep
-  call void @llvm.prefetch(i8* %tmp17, i32 1, i32 3, i32 1)
+  call void @llvm.prefetch(i8* %tmp25, i32 1, i32 3, i32 1)
   ret void
 }
 
diff --git a/test/CodeGen/AArch64/arm64-scvt.ll b/test/CodeGen/AArch64/arm64-scvt.ll
index 2e006cf..8baaf22 100644
--- a/test/CodeGen/AArch64/arm64-scvt.ll
+++ b/test/CodeGen/AArch64/arm64-scvt.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -mcpu=cyclone -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -mcpu=cortex-a57 | FileCheck --check-prefix=CHECK-A57 %s
 ; rdar://13082402
 
 define float @t1(i32* nocapture %src) nounwind ssp {
@@ -409,6 +410,10 @@ define float @sfct1(i8* nocapture %sp0) {
 ; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
 ; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
 ; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct1:
+; CHECK-A57: ldrsb w[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-A57-NEXT: scvtf [[REG:s[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul s0, [[REG]], [[REG]]
 entry:
   %addr = getelementptr i8* %sp0, i64 1
   %pix_sp0.0.copyload = load i8* %addr, align 1
@@ -466,6 +471,10 @@ define float @sfct5(i8* nocapture %sp0, i64 %offset) {
 ; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
 ; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
 ; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct5:
+; CHECK-A57: ldrsb w[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-A57-NEXT: scvtf [[REG:s[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul s0, [[REG]], [[REG]]
 entry:
   %addr = getelementptr i8* %sp0, i64 %offset
   %pix_sp0.0.copyload = load i8* %addr, align 1
@@ -536,6 +545,10 @@ define double @sfct10(i16* nocapture %sp0) {
 ; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
 ; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
 ; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct10:
+; CHECK-A57: ldrsh w[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-A57-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul d0, [[REG]], [[REG]]
 entry:
   %addr = getelementptr i16* %sp0, i64 1
   %pix_sp0.0.copyload = load i16* %addr, align 1
@@ -592,6 +605,10 @@ define double @sfct14(i16* nocapture %sp0, i64 %offset) {
 ; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
 ; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
 ; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct14:
+; CHECK-A57: ldrsh w[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-A57-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul d0, [[REG]], [[REG]]
 entry:
   %addr = getelementptr i16* %sp0, i64 %offset
   %pix_sp0.0.copyload = load i16* %addr, align 1
@@ -636,6 +653,10 @@ entry:
 ; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
 ; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
 ; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct17:
+; CHECK-A57: ldursb w[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-A57-NEXT: scvtf [[REG:s[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul s0, [[REG]], [[REG]]
   %bitcast = ptrtoint i8* %sp0 to i64
   %add = add i64 %bitcast, -1
   %addr = inttoptr i64 %add to i8*
@@ -713,6 +734,10 @@ define double @sfct22(i16* nocapture %sp0) {
 ; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
 ; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
 ; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct22:
+; CHECK-A57: ldursh w[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-A57-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul d0, [[REG]], [[REG]]
   %bitcast = ptrtoint i16* %sp0 to i64
   %add = add i64 %bitcast, 1
   %addr = inttoptr i64 %add to i16*
diff --git a/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll b/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll
new file mode 100644
index 0000000..67283b6
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -asm-verbose=false -mtriple=arm64-apple-ios | FileCheck %s
+
+define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind {
+; CHECK-LABEL: foo:
+; CHECK-NEXT: fcmeq.4s  v0, v0, v1
+; CHECK-NEXT: fmov.4s v1, #1.00000000
+; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: ret
+  %cmp = fcmp oeq <4 x float> %val, %test
+  %ext = zext <4 x i1> %cmp to <4 x i32>
+  %result = sitofp <4 x i32> %ext to <4 x float>
+  ret <4 x float> %result
+}
+; Make sure the operation doesn't try to get folded when the sizes don't match,
+; as that ends up crashing later when trying to form a bitcast operation for
+; the folded nodes.
+define void @foo1(<4 x float> %val, <4 x float> %test, <4 x double>* %p) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK: movi.4s
+; CHECK: scvtf.2d
+; CHECK: scvtf.2d
+  %cmp = fcmp oeq <4 x float> %val, %test
+  %ext = zext <4 x i1> %cmp to <4 x i32>
+  %result = sitofp <4 x i32> %ext to <4 x double>
+  store <4 x double> %result, <4 x double>* %p
+  ret void
+}
+
+; Fold explicit AND operations when the constant isn't a splat of a single
+; scalar value like what the zext creates.
+define <4 x float> @foo2(<4 x float> %val, <4 x float> %test) nounwind {
+; CHECK-LABEL: lCPI2_0:
+; CHECK-NEXT: .long 1065353216
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long 1065353216
+; CHECK-NEXT: .long 0
+; CHECK-LABEL: foo2:
+; CHECK: adrp  x8, lCPI2_0@PAGE
+; CHECK: ldr q2, [x8, lCPI2_0@PAGEOFF]
+; CHECK-NEXT:  fcmeq.4s  v0, v0, v1
+; CHECK-NEXT:  and.16b v0, v0, v2
+  %cmp = fcmp oeq <4 x float> %val, %test
+  %ext = zext <4 x i1> %cmp to <4 x i32>
+  %and = and <4 x i32> %ext, <i32 255, i32 256, i32 257, i32 258>
+  %result = sitofp <4 x i32> %and to <4 x float>
+  ret <4 x float> %result
+}
diff --git a/test/CodeGen/AArch64/arm64-shifted-sext.ll b/test/CodeGen/AArch64/arm64-shifted-sext.ll
index b7b4e5d..71f15b1 100644
--- a/test/CodeGen/AArch64/arm64-shifted-sext.ll
+++ b/test/CodeGen/AArch64/arm64-shifted-sext.ll
@@ -166,8 +166,8 @@ entry:
 define i32 @extendedLeftShiftshortTointBy16(i16 signext %a) nounwind readnone ssp {
 entry:
 ; CHECK-LABEL: extendedLeftShiftshortTointBy16:
-; CHECK: add [[REG:w[0-9]+]], w0, #1
-; CHECK: lsl w0, [[REG]], #16
+; CHECK: lsl [[REG:w[0-9]+]], w0, #16
+; CHECK: add w0, [[REG]], #16, lsl #12
   %inc = add i16 %a, 1
   %conv2 = zext i16 %inc to i32
   %shl = shl nuw i32 %conv2, 16
diff --git a/test/CodeGen/AArch64/arm64-stackmap.ll b/test/CodeGen/AArch64/arm64-stackmap.ll
index 2c7c6ae..144c2fd 100644
--- a/test/CodeGen/AArch64/arm64-stackmap.ll
+++ b/test/CodeGen/AArch64/arm64-stackmap.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin                             < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -fast-isel -fast-isel-abort < %s | FileCheck %s
 ;
 ; Note: Print verbose stackmaps using -debug-only=stackmaps.
 
diff --git a/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll b/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
new file mode 100644
index 0000000..a7f5215
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -enable-aa-sched-mi | FileCheck %s
+; Check that the scheduler moves the load from a[1] past the store into a[2].
+@a = common global i32* null, align 8
+@m = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define i32 @func(i32 %i, i32 %j, i32 %k) #0 {
+entry:
+; CHECK: ldr {{w[0-9]+}}, [x[[REG:[0-9]+]], #4]
+; CHECK: str {{w[0-9]+}}, [x[[REG]], #8]
+  %0 = load i32** @a, align 8, !tbaa !1
+  %arrayidx = getelementptr inbounds i32* %0, i64 2
+  store i32 %i, i32* %arrayidx, align 4, !tbaa !5
+  %arrayidx1 = getelementptr inbounds i32* %0, i64 1
+  %1 = load i32* %arrayidx1, align 4, !tbaa !5
+  %add = add nsw i32 %k, %i
+  store i32 %add, i32* @m, align 4, !tbaa !5
+  ret i32 %1
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.6.0 "}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"any pointer", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !6, metadata !6, i64 0}
+!6 = metadata !{metadata !"int", metadata !3, i64 0}
diff --git a/test/CodeGen/AArch64/arm64-vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll
index 5afc8d9..fae2b90 100644
--- a/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/test/CodeGen/AArch64/arm64-vabs.ll
@@ -802,3 +802,73 @@ define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
   %res1 = zext <2 x i32> %res to <2 x i64>
   ret <2 x i64> %res1
 }
+
+define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
+; CHECK-LABEL: abspattern1:
+; CHECK: abs.2s
+; CHECK-NEXT: ret
+        %tmp1neg = sub <2 x i32> zeroinitializer, %a
+        %b = icmp sge <2 x i32> %a, zeroinitializer
+        %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
+        ret <2 x i32> %abs
+}
+
+define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
+; CHECK-LABEL: abspattern2:
+; CHECK: abs.4h
+; CHECK-NEXT: ret
+        %tmp1neg = sub <4 x i16> zeroinitializer, %a
+        %b = icmp sgt <4 x i16> %a, zeroinitializer
+        %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
+        ret <4 x i16> %abs
+}
+
+define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
+; CHECK-LABEL: abspattern3:
+; CHECK: abs.8b
+; CHECK-NEXT: ret
+        %tmp1neg = sub <8 x i8> zeroinitializer, %a
+        %b = icmp slt <8 x i8> %a, zeroinitializer
+        %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
+        ret <8 x i8> %abs
+}
+
+define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
+; CHECK-LABEL: abspattern4:
+; CHECK: abs.4s
+; CHECK-NEXT: ret
+        %tmp1neg = sub <4 x i32> zeroinitializer, %a
+        %b = icmp sge <4 x i32> %a, zeroinitializer
+        %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
+        ret <4 x i32> %abs
+}
+
+define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
+; CHECK-LABEL: abspattern5:
+; CHECK: abs.8h
+; CHECK-NEXT: ret
+        %tmp1neg = sub <8 x i16> zeroinitializer, %a
+        %b = icmp sgt <8 x i16> %a, zeroinitializer
+        %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
+        ret <8 x i16> %abs
+}
+
+define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
+; CHECK-LABEL: abspattern6:
+; CHECK: abs.16b
+; CHECK-NEXT: ret
+        %tmp1neg = sub <16 x i8> zeroinitializer, %a
+        %b = icmp slt <16 x i8> %a, zeroinitializer
+        %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
+        ret <16 x i8> %abs
+}
+
+define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
+; CHECK-LABEL: abspattern7:
+; CHECK: abs.2d
+; CHECK-NEXT: ret
+        %tmp1neg = sub <2 x i64> zeroinitializer, %a
+        %b = icmp sle <2 x i64> %a, zeroinitializer
+        %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a
+        ret <2 x i64> %abs
+}
diff --git a/test/CodeGen/AArch64/arm64-vcvt_f.ll b/test/CodeGen/AArch64/arm64-vcvt_f.ll
index d244958..1f393c2 100644
--- a/test/CodeGen/AArch64/arm64-vcvt_f.ll
+++ b/test/CodeGen/AArch64/arm64-vcvt_f.ll
@@ -66,17 +66,17 @@ define i16 @to_half(float %in) {
 ; CHECK-LABEL: to_half:
 ; CHECK: fcvt h[[HALFVAL:[0-9]+]], s0
 ; CHECK: fmov {{w[0-9]+}}, {{s[0-9]+}}
-  %res = call i16 @llvm.convert.to.fp16(float %in)
+  %res = call i16 @llvm.convert.to.fp16.f32(float %in)
   ret i16 %res
 }
 
 define float @from_half(i16 %in) {
 ; CHECK-LABEL: from_half:
-; CHECK: fmov s[[HALFVAL:[0-9]+]], {{w[0-9]+}}
-; CHECK: fcvt s0, h[[HALFVAL]]
-  %res = call float @llvm.convert.from.fp16(i16 %in)
+; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}}
+; CHECK: fcvt s0, {{h[0-9]+}}
+  %res = call float @llvm.convert.from.fp16.f32(i16 %in)
   ret float %res
 }
 
-declare float @llvm.convert.from.fp16(i16) #1
-declare i16 @llvm.convert.to.fp16(float) #1
+declare float @llvm.convert.from.fp16.f32(i16) #1
+declare i16 @llvm.convert.to.fp16.f32(float) #1
diff --git a/test/CodeGen/AArch64/arm64-vector-ext.ll b/test/CodeGen/AArch64/arm64-vector-ext.ll
index 650ff1e..5bee161 100644
--- a/test/CodeGen/AArch64/arm64-vector-ext.ll
+++ b/test/CodeGen/AArch64/arm64-vector-ext.ll
@@ -14,3 +14,14 @@ define void @func30(%T0_30 %v0, %T1_30* %p1) {
   store %T1_30 %r, %T1_30* %p1
   ret void
 }
+
+; Extend from v1i1 was crashing things (PR20791). Make sure we do something
+; sensible instead.
+define <1 x i32> @autogen_SD7918() {
+; CHECK-LABEL: autogen_SD7918
+; CHECK: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+  %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0
+  %ZE = zext <1 x i1> %I29 to <1 x i32>
+  ret <1 x i32> %ZE
+}
diff --git a/test/CodeGen/AArch64/arm64-xaluo.ll b/test/CodeGen/AArch64/arm64-xaluo.ll
index 0c300de..59ce684 100644
--- a/test/CodeGen/AArch64/arm64-xaluo.ll
+++ b/test/CodeGen/AArch64/arm64-xaluo.ll
@@ -1,13 +1,14 @@
-; RUN: llc < %s -march=arm64 -aarch64-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0                             -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0 -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
 
 ;
 ; Get the actual value of the overflow bit.
 ;
-define i1 @saddo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @saddo1.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
-; CHECK-LABEL:  saddo.i32
-; CHECK:        adds w8, w0, w1
-; CHECK-NEXT:   cset w0, vs
+; CHECK-LABEL:  saddo1.i32
+; CHECK:        adds {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -15,11 +16,64 @@ entry:
   ret i1 %obit
 }
 
-define i1 @saddo.i64(i64 %v1, i64 %v2, i64* %res) {
+; Test the immediate version.
+define zeroext i1 @saddo2.i32(i32 %v1, i32* %res) {
 entry:
-; CHECK-LABEL:  saddo.i64
-; CHECK:        adds x8, x0, x1
-; CHECK-NEXT:   cset w0, vs
+; CHECK-LABEL:  saddo2.i32
+; CHECK:        adds {{w[0-9]+}}, w0, #4
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 4)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+; Test negative immediates.
+define zeroext i1 @saddo3.i32(i32 %v1, i32* %res) {
+entry:
+; CHECK-LABEL:  saddo3.i32
+; CHECK:        subs {{w[0-9]+}}, w0, #4
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 -4)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+; Test immediates that are too large to be encoded.
+define zeroext i1 @saddo4.i32(i32 %v1, i32* %res) {
+entry:
+; CHECK-LABEL:  saddo4.i32
+; CHECK:        adds {{w[0-9]+}}, w0, {{w[0-9]+}}
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 16777215)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+; Test shift folding.
+define zeroext i1 @saddo5.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; CHECK-LABEL:  saddo5.i32
+; CHECK:        adds {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
+  %lsl = shl i32 %v2, 16
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %lsl)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; CHECK-LABEL:  saddo1.i64
+; CHECK:        adds {{x[0-9]+}}, x0, x1
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -27,11 +81,35 @@ entry:
   ret i1 %obit
 }
 
-define i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @saddo2.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL:  saddo2.i64
+; CHECK:        adds {{x[0-9]+}}, x0, #4
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 4)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo3.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL:  saddo3.i64
+; CHECK:        subs {{x[0-9]+}}, x0, #4
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -4)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
 ; CHECK-LABEL:  uaddo.i32
-; CHECK:        adds w8, w0, w1
-; CHECK-NEXT:   cset w0, hs
+; CHECK:        adds {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:   cset {{w[0-9]+}}, hs
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -39,11 +117,11 @@ entry:
   ret i1 %obit
 }
 
-define i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
+define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
 ; CHECK-LABEL:  uaddo.i64
-; CHECK:        adds x8, x0, x1
-; CHECK-NEXT:   cset w0, hs
+; CHECK:        adds {{x[0-9]+}}, x0, x1
+; CHECK-NEXT:   cset {{w[0-9]+}}, hs
   %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -51,11 +129,11 @@ entry:
   ret i1 %obit
 }
 
-define i1 @ssubo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @ssubo1.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
-; CHECK-LABEL:  ssubo.i32
-; CHECK:        subs w8, w0, w1
-; CHECK-NEXT:   cset w0, vs
+; CHECK-LABEL:  ssubo1.i32
+; CHECK:        subs {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
   %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -63,11 +141,23 @@ entry:
   ret i1 %obit
 }
 
-define i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
+define zeroext i1 @ssubo2.i32(i32 %v1, i32* %res) {
+entry:
+; CHECK-LABEL:  ssubo2.i32
+; CHECK:        adds {{w[0-9]+}}, w0, #4
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
+  %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 -4)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
 ; CHECK-LABEL:  ssubo.i64
-; CHECK:        subs x8, x0, x1
-; CHECK-NEXT:   cset w0, vs
+; CHECK:        subs {{x[0-9]+}}, x0, x1
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
   %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -75,11 +165,11 @@ entry:
   ret i1 %obit
 }
 
-define i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
 ; CHECK-LABEL:  usubo.i32
-; CHECK:        subs w8, w0, w1
-; CHECK-NEXT:   cset w0, lo
+; CHECK:        subs {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:   cset {{w[0-9]+}}, lo
   %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -87,11 +177,11 @@ entry:
   ret i1 %obit
 }
 
-define i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
+define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
 ; CHECK-LABEL:  usubo.i64
-; CHECK:        subs x8, x0, x1
-; CHECK-NEXT:   cset w0, lo
+; CHECK:        subs {{x[0-9]+}}, x0, x1
+; CHECK-NEXT:   cset {{w[0-9]+}}, lo
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -99,13 +189,13 @@ entry:
   ret i1 %obit
 }
 
-define i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
 ; CHECK-LABEL:  smulo.i32
-; CHECK:        smull x8, w0, w1
-; CHECK-NEXT:   lsr x9, x8, #32
-; CHECK-NEXT:   cmp w9, w8, asr #31
-; CHECK-NEXT:   cset w0, ne
+; CHECK:        smull x[[MREG:[0-9]+]], w0, w1
+; CHECK-NEXT:   lsr x[[SREG:[0-9]+]], x[[MREG]], #32
+; CHECK-NEXT:   cmp w[[SREG]], w[[MREG]], asr #31
+; CHECK-NEXT:   cset {{w[0-9]+}}, ne
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -113,13 +203,13 @@ entry:
   ret i1 %obit
 }
 
-define i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
+define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
 ; CHECK-LABEL:  smulo.i64
-; CHECK:        mul x8, x0, x1
-; CHECK-NEXT:   smulh x9, x0, x1
-; CHECK-NEXT:   cmp x9, x8, asr #63
-; CHECK-NEXT:   cset w0, ne
+; CHECK:        mul [[MREG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   smulh [[HREG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   cmp [[HREG]], [[MREG]], asr #63
+; CHECK-NEXT:   cset {{w[0-9]+}}, ne
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -127,12 +217,24 @@ entry:
   ret i1 %obit
 }
 
-define i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @smulo2.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL:  smulo2.i64
+; CHECK:        adds [[MREG:x[0-9]+]], x0, x0
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
 ; CHECK-LABEL:  umulo.i32
-; CHECK:        umull x8, w0, w1
-; CHECK-NEXT:   cmp xzr, x8, lsr #32
-; CHECK-NEXT:   cset w0, ne
+; CHECK:        umull [[MREG:x[0-9]+]], w0, w1
+; CHECK-NEXT:   cmp xzr, [[MREG]], lsr #32
+; CHECK-NEXT:   cset {{w[0-9]+}}, ne
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -140,13 +242,12 @@ entry:
   ret i1 %obit
 }
 
-define i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
+define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
 ; CHECK-LABEL:  umulo.i64
-; CHECK:        umulh x8, x0, x1
-; CHECK-NEXT:   cmp xzr, x8
-; CHECK-NEXT:   cset w8, ne
-; CHECK-NEXT:   mul x9, x0, x1
+; CHECK:        umulh [[MREG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   cmp xzr, [[MREG]]
+; CHECK-NEXT:   cset {{w[0-9]+}}, ne
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -154,6 +255,18 @@ entry:
   ret i1 %obit
 }
 
+define zeroext i1 @umulo2.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL:  umulo2.i64
+; CHECK:        adds [[MREG:x[0-9]+]], x0, x0
+; CHECK-NEXT:   cset {{w[0-9]+}}, hs
+  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
 
 ;
 ; Check the use of the overflow bit in combination with a select instruction.
@@ -249,9 +362,9 @@ entry:
 define i32 @smulo.select.i32(i32 %v1, i32 %v2) {
 entry:
 ; CHECK-LABEL:  smulo.select.i32
-; CHECK:        smull    x8, w0, w1
-; CHECK-NEXT:   lsr     x9, x8, #32
-; CHECK-NEXT:   cmp     w9, w8, asr #31
+; CHECK:        smull   x[[MREG:[0-9]+]], w0, w1
+; CHECK-NEXT:   lsr     x[[SREG:[0-9]+]], x[[MREG]], #32
+; CHECK-NEXT:   cmp     w[[SREG]], w[[MREG]], asr #31
 ; CHECK-NEXT:   csel    w0, w0, w1, ne
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
@@ -262,9 +375,9 @@ entry:
 define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 entry:
 ; CHECK-LABEL:  smulo.select.i64
-; CHECK:        mul      x8, x0, x1
-; CHECK-NEXT:   smulh   x9, x0, x1
-; CHECK-NEXT:   cmp     x9, x8, asr #63
+; CHECK:        mul     [[MREG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   smulh   [[HREG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   cmp     [[HREG]], [[MREG]], asr #63
 ; CHECK-NEXT:   csel    x0, x0, x1, ne
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
@@ -275,8 +388,8 @@ entry:
 define i32 @umulo.select.i32(i32 %v1, i32 %v2) {
 entry:
 ; CHECK-LABEL:  umulo.select.i32
-; CHECK:        umull    x8, w0, w1
-; CHECK-NEXT:   cmp     xzr, x8, lsr #32
+; CHECK:        umull   [[MREG:x[0-9]+]], w0, w1
+; CHECK-NEXT:   cmp     xzr, [[MREG]], lsr #32
 ; CHECK-NEXT:   csel    w0, w0, w1, ne
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
@@ -287,8 +400,8 @@ entry:
 define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 entry:
 ; CHECK-LABEL:  umulo.select.i64
-; CHECK:        umulh   x8, x0, x1
-; CHECK-NEXT:   cmp     xzr, x8
+; CHECK:        umulh   [[MREG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   cmp     xzr, [[MREG]]
 ; CHECK-NEXT:   csel    x0, x0, x1, ne
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
@@ -300,7 +413,7 @@ entry:
 ;
 ; Check the use of the overflow bit in combination with a branch instruction.
 ;
-define i1 @saddo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) {
 entry:
 ; CHECK-LABEL:  saddo.br.i32
 ; CHECK:        cmn w0, w1
@@ -317,7 +430,7 @@ continue:
   ret i1 true
 }
 
-define i1 @saddo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
 entry:
 ; CHECK-LABEL:  saddo.br.i64
 ; CHECK:        cmn x0, x1
@@ -334,7 +447,7 @@ continue:
   ret i1 true
 }
 
-define i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
 entry:
 ; CHECK-LABEL:  uaddo.br.i32
 ; CHECK:        cmn w0, w1
@@ -351,7 +464,7 @@ continue:
   ret i1 true
 }
 
-define i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
 entry:
 ; CHECK-LABEL:  uaddo.br.i64
 ; CHECK:        cmn x0, x1
@@ -368,7 +481,7 @@ continue:
   ret i1 true
 }
 
-define i1 @ssubo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) {
 entry:
 ; CHECK-LABEL:  ssubo.br.i32
 ; CHECK:        cmp w0, w1
@@ -385,7 +498,7 @@ continue:
   ret i1 true
 }
 
-define i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
 entry:
 ; CHECK-LABEL:  ssubo.br.i64
 ; CHECK:        cmp x0, x1
@@ -402,7 +515,7 @@ continue:
   ret i1 true
 }
 
-define i1 @usubo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) {
 entry:
 ; CHECK-LABEL:  usubo.br.i32
 ; CHECK:        cmp w0, w1
@@ -419,7 +532,7 @@ continue:
   ret i1 true
 }
 
-define i1 @usubo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
 entry:
 ; CHECK-LABEL:  usubo.br.i64
 ; CHECK:        cmp x0, x1
@@ -436,12 +549,12 @@ continue:
   ret i1 true
 }
 
-define i1 @smulo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) {
 entry:
 ; CHECK-LABEL:  smulo.br.i32
-; CHECK:        smull    x8, w0, w1
-; CHECK-NEXT:   lsr     x9, x8, #32
-; CHECK-NEXT:   cmp     w9, w8, asr #31
+; CHECK:        smull   x[[MREG:[0-9]+]], w0, w1
+; CHECK-NEXT:   lsr     x[[SREG:[0-9]+]], x8, #32
+; CHECK-NEXT:   cmp     w[[SREG]], w[[MREG]], asr #31
 ; CHECK-NEXT:   b.eq
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
@@ -455,12 +568,12 @@ continue:
   ret i1 true
 }
 
-define i1 @smulo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 entry:
 ; CHECK-LABEL:  smulo.br.i64
-; CHECK:        mul      x8, x0, x1
-; CHECK-NEXT:   smulh   x9, x0, x1
-; CHECK-NEXT:   cmp     x9, x8, asr #63
+; CHECK:        mul     [[MREG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   smulh   [[HREG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   cmp     [[HREG]], [[MREG]], asr #63
 ; CHECK-NEXT:   b.eq
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
@@ -474,11 +587,28 @@ continue:
   ret i1 true
 }
 
-define i1 @umulo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @smulo2.br.i64(i64 %v1) {
+entry:
+; CHECK-LABEL:  smulo2.br.i64
+; CHECK:        cmn  x0, x0
+; CHECK-NEXT:   b.vc
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) {
 entry:
 ; CHECK-LABEL:  umulo.br.i32
-; CHECK:        umull    x8, w0, w1
-; CHECK-NEXT:   cmp     xzr, x8, lsr #32
+; CHECK:        umull   [[MREG:x[0-9]+]], w0, w1
+; CHECK-NEXT:   cmp     xzr, [[MREG]], lsr #32
 ; CHECK-NEXT:   b.eq
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
@@ -492,11 +622,11 @@ continue:
   ret i1 true
 }
 
-define i1 @umulo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 entry:
 ; CHECK-LABEL:  umulo.br.i64
-; CHECK:        umulh   x8, x0, x1
-; CHECK-NEXT:   cbz
+; CHECK:        umulh   [[REG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   {{cbz|cmp}}
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -509,6 +639,23 @@ continue:
   ret i1 true
 }
 
+define zeroext i1 @umulo2.br.i64(i64 %v1) {
+entry:
+; CHECK-LABEL:  umulo2.br.i64
+; CHECK:        cmn  x0, x0
+; CHECK-NEXT:   b.lo
+  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
 declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
 declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll
index 26301b9..ef209e9 100644
--- a/test/CodeGen/AArch64/atomic-ops.ll
+++ b/test/CodeGen/AArch64/atomic-ops.ll
@@ -509,7 +509,7 @@ define i8 @test_atomic_load_min_i8(i8 %offset) nounwind {
 ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
 ; CHECK-NOT: dmb
 
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD_EXT]]
    ret i8 %old
 }
 
@@ -534,7 +534,7 @@ define i16 @test_atomic_load_min_i16(i16 %offset) nounwind {
 ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
 ; CHECK-NOT: dmb
 
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD_EXT]]
    ret i16 %old
 }
 
@@ -607,7 +607,7 @@ define i8 @test_atomic_load_max_i8(i8 %offset) nounwind {
 ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
 ; CHECK-NOT: dmb
 
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD_EXT]]
    ret i8 %old
 }
 
@@ -632,7 +632,7 @@ define i16 @test_atomic_load_max_i16(i16 %offset) nounwind {
 ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
 ; CHECK-NOT: dmb
 
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD_EXT]]
    ret i16 %old
 }
 
diff --git a/test/CodeGen/AArch64/br-undef-cond.ll b/test/CodeGen/AArch64/br-undef-cond.ll
new file mode 100644
index 0000000..12d0da2
--- /dev/null
+++ b/test/CodeGen/AArch64/br-undef-cond.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -verify-machineinstrs
+
+; Make sure we don't end up with a CBNZ of an undef v-/phys-reg.
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios"
+
+declare void @bar(i8*)
+
+define void @foo(i8* %m, i32 %off0) {
+.thread1653:
+  br i1 undef, label %0, label %.thread1880
+
+  %1 = icmp eq i32 undef, 0
+  %.not = xor i1 %1, true
+  %brmerge = or i1 %.not, undef
+  br i1 %brmerge, label %.thread1880, label %.thread1705
+
+.thread1705:
+  ret void
+
+.thread1880:
+  %m1652.ph = phi i8* [ %m, %0 ], [ null, %.thread1653 ]
+  call void @bar(i8* %m1652.ph)
+  ret void
+}
diff --git a/test/CodeGen/AArch64/cmp-const-max.ll b/test/CodeGen/AArch64/cmp-const-max.ll
new file mode 100644
index 0000000..0431e39
--- /dev/null
+++ b/test/CodeGen/AArch64/cmp-const-max.ll
@@ -0,0 +1,36 @@
+; RUN: llc -verify-machineinstrs -aarch64-atomic-cfg-tidy=0 < %s -mtriple=aarch64-none-eabihf -fast-isel=false | FileCheck %s
+
+
+define i32 @ule_64_max(i64 %p) {
+entry:
+; CHECK-LABEL: ule_64_max:
+; CHECK: cmn x0, #1
+; CHECK: b.hi [[RET_ZERO:.LBB[0-9]+_[0-9]+]]
+  %cmp = icmp ule i64 %p, 18446744073709551615 ; 0xffffffffffffffff
+  br i1 %cmp, label %ret_one, label %ret_zero
+
+ret_one:
+  ret i32 1
+
+ret_zero:
+; CHECK: [[RET_ZERO]]:
+; CHECK-NEXT: mov w0, wzr
+  ret i32 0
+}
+
+define i32 @ugt_64_max(i64 %p) {
+entry:
+; CHECK-LABEL: ugt_64_max:
+; CHECK: cmn x0, #1
+; CHECK: b.ls [[RET_ZERO:.LBB[0-9]+_[0-9]+]]
+  %cmp = icmp ugt i64 %p, 18446744073709551615 ; 0xffffffffffffffff
+  br i1 %cmp, label %ret_one, label %ret_zero
+
+ret_one:
+  ret i32 1
+
+ret_zero:
+; CHECK: [[RET_ZERO]]:
+; CHECK-NEXT: mov w0, wzr
+  ret i32 0
+}
diff --git a/test/CodeGen/AArch64/cmpwithshort.ll b/test/CodeGen/AArch64/cmpwithshort.ll
new file mode 100644
index 0000000..14efdcc
--- /dev/null
+++ b/test/CodeGen/AArch64/cmpwithshort.ll
@@ -0,0 +1,46 @@
+; RUN: llc -O3 -march=aarch64 < %s | FileCheck %s 
+
+define i16 @test_1cmp_signed_1(i16* %ptr1) {
+; CHECK-LABLE: @test_1cmp_signed_1
+; CHECK: ldrsh
+; CHECK-NEXT: cmn
+entry:
+  %addr = getelementptr inbounds i16* %ptr1, i16 0
+  %val = load i16* %addr, align 2
+  %cmp = icmp eq i16 %val, -1
+  br i1 %cmp, label %if, label %if.then
+if:
+  ret i16 1
+if.then:
+  ret i16 0
+}
+
+define i16 @test_1cmp_signed_2(i16* %ptr1) {
+; CHECK-LABLE: @test_1cmp_signed_2
+; CHECK: ldrsh
+; CHECK-NEXT: cmn
+entry:
+  %addr = getelementptr inbounds i16* %ptr1, i16 0
+  %val = load i16* %addr, align 2
+  %cmp = icmp sge i16 %val, -1
+  br i1 %cmp, label %if, label %if.then
+if:
+  ret i16 1
+if.then:
+  ret i16 0
+}
+
+define i16 @test_1cmp_unsigned_1(i16* %ptr1) {
+; CHECK-LABLE: @test_1cmp_unsigned_1
+; CHECK: ldrsh
+; CHECK-NEXT: cmn
+entry:
+  %addr = getelementptr inbounds i16* %ptr1, i16 0
+  %val = load i16* %addr, align 2
+  %cmp = icmp uge i16 %val, -1
+  br i1 %cmp, label %if, label %if.then
+if:
+  ret i16 1
+if.then:
+  ret i16 0
+}                                           
diff --git a/test/CodeGen/AArch64/combine-comparisons-by-cse.ll b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
new file mode 100644
index 0000000..df8dc87
--- /dev/null
+++ b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
@@ -0,0 +1,413 @@
+; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu | FileCheck %s
+
+; marked as external to prevent possible optimizations
+@a = external global i32
+@b = external global i32
+@c = external global i32
+@d = external global i32
+
+; (a > 10 && b == c) || (a >= 10 && b == d)
+define i32 @combine_gt_ge_10() #0 {
+; CHECK-LABEL: combine_gt_ge_10
+; CHECK: cmp
+; CHECK: b.le
+; CHECK: ret
+; CHECK-NOT: cmp
+; CHECK: b.lt
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp sgt i32 %0, 10
+  br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true:                                    ; preds = %entry
+  %1 = load i32* @b, align 4
+  %2 = load i32* @c, align 4
+  %cmp1 = icmp eq i32 %1, %2
+  br i1 %cmp1, label %return, label %land.lhs.true3
+
+lor.lhs.false:                                    ; preds = %entry
+  %cmp2 = icmp sgt i32 %0, 9
+  br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3:                                   ; preds = %lor.lhs.false, %land.lhs.true
+  %3 = load i32* @b, align 4
+  %4 = load i32* @d, align 4
+  %cmp4 = icmp eq i32 %3, %4
+  br i1 %cmp4, label %return, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true3, %lor.lhs.false
+  br label %return
+
+return:                                           ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+  %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+; (a > 5 && b == c) || (a < 5 && b == d)
+define i32 @combine_gt_lt_5() #0 {
+; CHECK-LABEL: combine_gt_lt_5
+; CHECK: cmp
+; CHECK: b.le
+; CHECK: ret
+; CHECK-NOT: cmp
+; CHECK: b.ge
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp sgt i32 %0, 5
+  br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true:                                    ; preds = %entry
+  %1 = load i32* @b, align 4
+  %2 = load i32* @c, align 4
+  %cmp1 = icmp eq i32 %1, %2
+  br i1 %cmp1, label %return, label %if.end
+
+lor.lhs.false:                                    ; preds = %entry
+  %cmp2 = icmp slt i32 %0, 5
+  br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3:                                   ; preds = %lor.lhs.false
+  %3 = load i32* @b, align 4
+  %4 = load i32* @d, align 4
+  %cmp4 = icmp eq i32 %3, %4
+  br i1 %cmp4, label %return, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true3, %lor.lhs.false, %land.lhs.true
+  br label %return
+
+return:                                           ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+  %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+; (a < 5 && b == c) || (a <= 5 && b == d)
+define i32 @combine_lt_ge_5() #0 {
+; CHECK-LABEL: combine_lt_ge_5
+; CHECK: cmp
+; CHECK: b.ge
+; CHECK: ret
+; CHECK-NOT: cmp
+; CHECK: b.gt
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp slt i32 %0, 5
+  br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true:                                    ; preds = %entry
+  %1 = load i32* @b, align 4
+  %2 = load i32* @c, align 4
+  %cmp1 = icmp eq i32 %1, %2
+  br i1 %cmp1, label %return, label %land.lhs.true3
+
+lor.lhs.false:                                    ; preds = %entry
+  %cmp2 = icmp slt i32 %0, 6
+  br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3:                                   ; preds = %lor.lhs.false, %land.lhs.true
+  %3 = load i32* @b, align 4
+  %4 = load i32* @d, align 4
+  %cmp4 = icmp eq i32 %3, %4
+  br i1 %cmp4, label %return, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true3, %lor.lhs.false
+  br label %return
+
+return:                                           ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+  %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+; (a < 5 && b == c) || (a > 5 && b == d)
+define i32 @combine_lt_gt_5() #0 {
+; CHECK-LABEL: combine_lt_gt_5
+; CHECK: cmp
+; CHECK: b.ge
+; CHECK: ret
+; CHECK-NOT: cmp
+; CHECK: b.le
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp slt i32 %0, 5
+  br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true:                                    ; preds = %entry
+  %1 = load i32* @b, align 4
+  %2 = load i32* @c, align 4
+  %cmp1 = icmp eq i32 %1, %2
+  br i1 %cmp1, label %return, label %if.end
+
+lor.lhs.false:                                    ; preds = %entry
+  %cmp2 = icmp sgt i32 %0, 5
+  br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3:                                   ; preds = %lor.lhs.false
+  %3 = load i32* @b, align 4
+  %4 = load i32* @d, align 4
+  %cmp4 = icmp eq i32 %3, %4
+  br i1 %cmp4, label %return, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true3, %lor.lhs.false, %land.lhs.true
+  br label %return
+
+return:                                           ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+  %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+; (a > -5 && b == c) || (a < -5 && b == d)
+define i32 @combine_gt_lt_n5() #0 {
+; CHECK-LABEL: combine_gt_lt_n5
+; CHECK: cmn
+; CHECK: b.le
+; CHECK: ret
+; CHECK-NOT: cmn
+; CHECK: b.ge
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp sgt i32 %0, -5
+  br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true:                                    ; preds = %entry
+  %1 = load i32* @b, align 4
+  %2 = load i32* @c, align 4
+  %cmp1 = icmp eq i32 %1, %2
+  br i1 %cmp1, label %return, label %if.end
+
+lor.lhs.false:                                    ; preds = %entry
+  %cmp2 = icmp slt i32 %0, -5
+  br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3:                                   ; preds = %lor.lhs.false
+  %3 = load i32* @b, align 4
+  %4 = load i32* @d, align 4
+  %cmp4 = icmp eq i32 %3, %4
+  br i1 %cmp4, label %return, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true3, %lor.lhs.false, %land.lhs.true
+  br label %return
+
+return:                                           ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+  %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+; (a < -5 && b == c) || (a > -5 && b == d)
+define i32 @combine_lt_gt_n5() #0 {
+; CHECK-LABEL: combine_lt_gt_n5
+; CHECK: cmn
+; CHECK: b.ge
+; CHECK: ret
+; CHECK-NOT: cmn
+; CHECK: b.le
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp slt i32 %0, -5
+  br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true:                                    ; preds = %entry
+  %1 = load i32* @b, align 4
+  %2 = load i32* @c, align 4
+  %cmp1 = icmp eq i32 %1, %2
+  br i1 %cmp1, label %return, label %if.end
+
+lor.lhs.false:                                    ; preds = %entry
+  %cmp2 = icmp sgt i32 %0, -5
+  br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3:                                   ; preds = %lor.lhs.false
+  %3 = load i32* @b, align 4
+  %4 = load i32* @d, align 4
+  %cmp4 = icmp eq i32 %3, %4
+  br i1 %cmp4, label %return, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true3, %lor.lhs.false, %land.lhs.true
+  br label %return
+
+return:                                           ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+  %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+%struct.Struct = type { i64, i64 }
+
+@glob = internal unnamed_addr global %struct.Struct* null, align 8
+
+declare %struct.Struct* @Update(%struct.Struct*) #1
+
+; no checks for this case, it just should be processed without errors
+define void @combine_non_adjacent_cmp_br(%struct.Struct* nocapture readonly %hdCall) #0 {
+entry:
+  %size = getelementptr inbounds %struct.Struct* %hdCall, i64 0, i32 0
+  %0 = load i64* %size, align 8
+  br label %land.rhs
+
+land.rhs:
+  %rp.06 = phi i64 [ %0, %entry ], [ %sub, %while.body ]
+  %1 = load i64* inttoptr (i64 24 to i64*), align 8
+  %cmp2 = icmp sgt i64 %1, 0
+  br i1 %cmp2, label %while.body, label %while.end
+
+while.body:
+  %2 = load %struct.Struct** @glob, align 8
+  %call = tail call %struct.Struct* @Update(%struct.Struct* %2) #2
+  %sub = add nsw i64 %rp.06, -2
+  %cmp = icmp slt i64 %0, %rp.06
+  br i1 %cmp, label %land.rhs, label %while.end
+
+while.end:
+  ret void
+}
+
+; undefined external to prevent possible optimizations
+declare void @do_something() #1
+
+define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 {
+; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ
+; CHECK: cmn
+; CHECK: b.gt
+; CHECK: cmp
+; CHECK: b.gt
+entry:
+  %0 = load i32* @a, align 4
+  %cmp4 = icmp slt i32 %0, -1
+  br i1 %cmp4, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body, %while.body.preheader
+  %i.05 = phi i32 [ %inc, %while.body ], [ %0, %while.body.preheader ]
+  tail call void @do_something() #2
+  %inc = add nsw i32 %i.05, 1
+  %cmp = icmp slt i32 %i.05, 0
+  br i1 %cmp, label %while.body, label %while.cond.while.end_crit_edge
+
+while.cond.while.end_crit_edge:                   ; preds = %while.body
+  %.pre = load i32* @a, align 4
+  br label %while.end
+
+while.end:                                        ; preds = %while.cond.while.end_crit_edge, %entry
+  %1 = phi i32 [ %.pre, %while.cond.while.end_crit_edge ], [ %0, %entry ]
+  %cmp1 = icmp slt i32 %1, 2
+  br i1 %cmp1, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %while.end
+  %2 = load i32* @b, align 4
+  %3 = load i32* @d, align 4
+  %cmp2 = icmp eq i32 %2, %3
+  br i1 %cmp2, label %return, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true, %while.end
+  br label %return
+
+return:                                           ; preds = %if.end, %land.lhs.true
+  %retval.0 = phi i32 [ 0, %if.end ], [ 123, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+define i32 @do_nothing_if_compares_can_not_be_adjusted_to_each_other() #0 {
+; CHECK-LABEL: do_nothing_if_compares_can_not_be_adjusted_to_each_other
+; CHECK: cmp
+; CHECK: b.gt
+; CHECK: cmn
+; CHECK: b.lt
+entry:
+  %0 = load i32* @a, align 4
+  %cmp4 = icmp slt i32 %0, 1
+  br i1 %cmp4, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body, %while.body.preheader
+  %i.05 = phi i32 [ %inc, %while.body ], [ %0, %while.body.preheader ]
+  tail call void @do_something() #2
+  %inc = add nsw i32 %i.05, 1
+  %cmp = icmp slt i32 %i.05, 0
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %1 = load i32* @c, align 4
+  %cmp1 = icmp sgt i32 %1, -3
+  br i1 %cmp1, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %while.end
+  %2 = load i32* @b, align 4
+  %3 = load i32* @d, align 4
+  %cmp2 = icmp eq i32 %2, %3
+  br i1 %cmp2, label %return, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true, %while.end
+  br label %return
+
+return:                                           ; preds = %if.end, %land.lhs.true
+  %retval.0 = phi i32 [ 0, %if.end ], [ 123, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+; Test in the following case, we don't hit 'cmp' and trigger a false positive
+; cmp  w19, #0
+; cinc w0, w19, gt
+; ...
+; fcmp d8, #0.0
+; b.gt .LBB0_5
+
+define i32 @fcmpri(i32 %argc, i8** nocapture readonly %argv) {
+
+; CHECK-LABEL: fcmpri:
+; CHECK: cmp w0, #2
+; CHECK: b.lt .LBB9_3
+; CHECK-NOT: cmp w0, #1
+; CHECK-NOT: b.le .LBB9_3
+
+; CHECK-LABEL-DAG: .LBB9_3
+; CHECK: cmp w19, #0
+; CHECK: fcmp d8, #0.0
+; CHECK: b.gt .LBB9_5
+; CHECK-NOT: cmp w19, #1
+; CHECK-NOT: b.ge .LBB9_5
+
+entry:
+  %cmp = icmp sgt i32 %argc, 1
+  br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %entry
+  %arrayidx = getelementptr inbounds i8** %argv, i64 1
+  %0 = load i8** %arrayidx, align 8
+  %cmp1 = icmp eq i8* %0, null
+  br i1 %cmp1, label %if.end, label %return
+
+if.end:                                           ; preds = %land.lhs.true, %entry
+  %call = call i32 @zoo(i32 1)
+  %call2 = call double @yoo(i32 -1)
+  %cmp4 = icmp sgt i32 %call, 0
+  %add = zext i1 %cmp4 to i32
+  %cond = add nsw i32 %add, %call
+  %call7 = call i32 @xoo(i32 %cond, i32 2)
+  %cmp9 = fcmp ogt double %call2, 0.000000e+00
+  br i1 %cmp9, label %cond.end14, label %cond.false12
+
+cond.false12:                                     ; preds = %if.end
+  %sub = fadd fast double %call2, -1.000000e+00
+  br label %cond.end14
+
+cond.end14:                                       ; preds = %if.end, %cond.false12
+  %cond15 = phi double [ %sub, %cond.false12 ], [ %call2, %if.end ]
+  %call16 = call i32 @woo(double %cond15, double -2.000000e+00)
+  br label %return
+
+return:                                           ; preds = %land.lhs.true, %cond.end14
+  %retval.0 = phi i32 [ 4, %cond.end14 ], [ 3, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+declare i32 @zoo(i32)
+
+declare double @yoo(i32)
+
+declare i32 @xoo(i32, i32)
+
+declare i32 @woo(double, double)
diff --git a/test/CodeGen/AArch64/cond-sel.ll b/test/CodeGen/AArch64/cond-sel.ll
index 5f81cba..dfc83aa 100644
--- a/test/CodeGen/AArch64/cond-sel.ll
+++ b/test/CodeGen/AArch64/cond-sel.ll
@@ -214,3 +214,20 @@ define void @test_csetm(i32 %lhs, i32 %rhs, i64 %lhs64) {
   ret void
 ; CHECK: ret
 }
+
+define <1 x i1> @test_wide_comparison(i32 %in) {
+; CHECK-LABEL: test_wide_comparison:
+; CHECK: cmp w0, #1234
+; CHECK: cset
+
+  %tmp = icmp sgt i32 %in, 1234
+  %res = select i1 %tmp, <1 x i1> <i1 1>, <1 x i1> zeroinitializer
+  ret <1 x i1> %res
+}
+
+define i32 @test_select_undef() {
+; CHECK-LABEL: test_select_undef:
+; CHECK: ret
+  %res = select i1 undef, i32 0, i32 42
+  ret i32 %res
+}
diff --git a/test/CodeGen/AArch64/dag-combine-invaraints.ll b/test/CodeGen/AArch64/dag-combine-invaraints.ll
new file mode 100644
index 0000000..115fc64
--- /dev/null
+++ b/test/CodeGen/AArch64/dag-combine-invaraints.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple=arm64-apple-darwin8.0 -relocation-model=pic -O1 < %s | FileCheck %s
+
+@.str2 = private unnamed_addr constant [9 x i8] c"_%d____\0A\00", align 1
+
+; Function Attrs: nounwind ssp
+define i32 @main(i32 %argc, i8** %argv) #0 {
+main_:
+  %tmp = alloca i32, align 4
+  %i32T = alloca i32, align 4
+  %i32F = alloca i32, align 4
+  %i32X = alloca i32, align 4
+  store i32 0, i32* %tmp
+  store i32 15, i32* %i32T, align 4
+  store i32 5, i32* %i32F, align 4
+  %tmp6 = load i32* %tmp, align 4
+  %tmp7 = icmp ne i32 %tmp6, 0
+  %tmp8 = xor i1 %tmp7, true
+  %tmp9 = load i32* %i32T, align 4
+  %tmp10 = load i32* %i32F, align 4
+  %DHSelect = select i1 %tmp8, i32 %tmp9, i32 %tmp10
+  store i32 %DHSelect, i32* %i32X, align 4
+  %tmp15 = load i32* %i32X, align 4
+  %tmp17 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str2, i32 0, i32 0), i32 %tmp15)
+  ret i32 0
+
+; CHECK: main:
+; CHECK-DAG: movz
+; CHECK-DAG: orr
+; CHECK: csel
+}
+
+
+declare i32 @printf(i8*, ...) #1
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/AArch64/dont-take-over-the-world.ll b/test/CodeGen/AArch64/dont-take-over-the-world.ll
new file mode 100644
index 0000000..d9e13b7
--- /dev/null
+++ b/test/CodeGen/AArch64/dont-take-over-the-world.ll
@@ -0,0 +1,7 @@
+; RUN: not llc -mtriple=x86-64 2>&1 | FileCheck %s
+
+; To support "arm64" as a -march option, we need to register a second AArch64
+; target, but we have to be careful how we do that so that it doesn't become the
+; target of last resort when the specified triple is completely wrong.
+
+; CHECK: unable to get target for 'x86-64', see --version and --triple.
diff --git a/test/CodeGen/AArch64/extern-weak.ll b/test/CodeGen/AArch64/extern-weak.ll
index ce5c0f6..f647c4b 100644
--- a/test/CodeGen/AArch64/extern-weak.ll
+++ b/test/CodeGen/AArch64/extern-weak.ll
@@ -1,17 +1,24 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck --check-prefix=CHECK-STATIC %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large -o - %s | FileCheck --check-prefix=CHECK-LARGE %s
 
 declare extern_weak i32 @var()
 
 define i32()* @foo() {
 ; The usual ADRP/ADD pair can't be used for a weak reference because it must
-; evaluate to 0 if the symbol is undefined. We use a litpool entry.
+; evaluate to 0 if the symbol is undefined. We use a GOT entry for PIC
+; otherwise a litpool entry.
   ret i32()* @var
 
 
 ; CHECK: adrp x[[ADDRHI:[0-9]+]], :got:var
 ; CHECK: ldr x0, [x[[ADDRHI]], :got_lo12:var]
 
+; CHECK-STATIC: .LCPI0_0:
+; CHECK-STATIC-NEXT: .xword  var
+; CHECK-STATIC: adrp x[[VAR:[0-9]+]], .LCPI0_0
+; CHECK-STATIC: ldr x0, [x[[VAR]], :lo12:.LCPI0_0]
+
   ; In the large model, the usual relocations are absolute and can
   ; materialise 0.
 ; CHECK-LARGE: movz x0, #:abs_g3:var
@@ -31,6 +38,11 @@ define i32* @bar() {
 ; CHECK: ldr [[BASE:x[0-9]+]], [x[[ADDRHI]], :got_lo12:arr_var]
 ; CHECK: add x0, [[BASE]], #20
 
+; CHECK-STATIC: .LCPI1_0:
+; CHECK-STATIC-NEXT: .xword arr_var
+; CHECK-STATIC: ldr [[BASE:x[0-9]+]], [{{x[0-9]+}}, :lo12:.LCPI1_0]
+; CHECK-STATIC: add x0, [[BASE]], #20
+
   ret i32* %addr
 
   ; In the large model, the usual relocations are absolute and can
@@ -49,6 +61,9 @@ define i32* @wibble() {
 ; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var
 ; CHECK: add x0, [[BASE]], :lo12:defined_weak_var
 
+; CHECK-STATIC: adrp [[BASE:x[0-9]+]], defined_weak_var
+; CHECK-STATIC: add x0, [[BASE]], :lo12:defined_weak_var
+
 ; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var
 ; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var
 ; CHECK-LARGE: movk x0, #:abs_g1_nc:defined_weak_var
diff --git a/test/CodeGen/AArch64/f16-convert.ll b/test/CodeGen/AArch64/f16-convert.ll
index 6fabdc5..12412d4 100644
--- a/test/CodeGen/AArch64/f16-convert.ll
+++ b/test/CodeGen/AArch64/f16-convert.ll
@@ -7,7 +7,7 @@ define float @load0(i16* nocapture readonly %a) nounwind {
 ; CHECK-NEXT: ret
 
   %tmp = load i16* %a, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+  %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp)
   ret float %tmp1
 }
 
@@ -18,8 +18,7 @@ define double @load1(i16* nocapture readonly %a) nounwind {
 ; CHECK-NEXT: ret
 
   %tmp = load i16* %a, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
-  %conv = fpext float %tmp1 to double
+  %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp)
   ret double %conv
 }
 
@@ -32,7 +31,7 @@ define float @load2(i16* nocapture readonly %a, i32 %i) nounwind {
   %idxprom = sext i32 %i to i64
   %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
   %tmp = load i16* %arrayidx, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+  %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp)
   ret float %tmp1
 }
 
@@ -45,8 +44,7 @@ define double @load3(i16* nocapture readonly %a, i32 %i) nounwind {
   %idxprom = sext i32 %i to i64
   %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
   %tmp = load i16* %arrayidx, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
-  %conv = fpext float %tmp1 to double
+  %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp)
   ret double %conv
 }
 
@@ -58,7 +56,7 @@ define float @load4(i16* nocapture readonly %a, i64 %i) nounwind {
 
   %arrayidx = getelementptr inbounds i16* %a, i64 %i
   %tmp = load i16* %arrayidx, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+  %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp)
   ret float %tmp1
 }
 
@@ -70,8 +68,7 @@ define double @load5(i16* nocapture readonly %a, i64 %i) nounwind {
 
   %arrayidx = getelementptr inbounds i16* %a, i64 %i
   %tmp = load i16* %arrayidx, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
-  %conv = fpext float %tmp1 to double
+  %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp)
   ret double %conv
 }
 
@@ -83,7 +80,7 @@ define float @load6(i16* nocapture readonly %a) nounwind {
 
   %arrayidx = getelementptr inbounds i16* %a, i64 10
   %tmp = load i16* %arrayidx, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+  %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp)
   ret float %tmp1
 }
 
@@ -95,8 +92,7 @@ define double @load7(i16* nocapture readonly %a) nounwind {
 
   %arrayidx = getelementptr inbounds i16* %a, i64 10
   %tmp = load i16* %arrayidx, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
-  %conv = fpext float %tmp1 to double
+  %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp)
   ret double %conv
 }
 
@@ -108,7 +104,7 @@ define float @load8(i16* nocapture readonly %a) nounwind {
 
   %arrayidx = getelementptr inbounds i16* %a, i64 -10
   %tmp = load i16* %arrayidx, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+  %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp)
   ret float %tmp1
 }
 
@@ -120,8 +116,7 @@ define double @load9(i16* nocapture readonly %a) nounwind {
 
   %arrayidx = getelementptr inbounds i16* %a, i64 -10
   %tmp = load i16* %arrayidx, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
-  %conv = fpext float %tmp1 to double
+  %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp)
   ret double %conv
 }
 
@@ -131,7 +126,7 @@ define void @store0(i16* nocapture %a, float %val) nounwind {
 ; CHECK-NEXT: str  h0, [x0]
 ; CHECK-NEXT: ret
 
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val)
   store i16 %tmp, i16* %a, align 2
   ret void
 }
@@ -143,7 +138,7 @@ define void @store1(i16* nocapture %a, double %val) nounwind {
 ; CHECK-NEXT: ret
 
   %conv = fptrunc double %val to float
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv)
   store i16 %tmp, i16* %a, align 2
   ret void
 }
@@ -154,7 +149,7 @@ define void @store2(i16* nocapture %a, i32 %i, float %val) nounwind {
 ; CHECK-NEXT: str h0, [x0, w1, sxtw #1]
 ; CHECK-NEXT: ret
 
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val)
   %idxprom = sext i32 %i to i64
   %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
   store i16 %tmp, i16* %arrayidx, align 2
@@ -168,7 +163,7 @@ define void @store3(i16* nocapture %a, i32 %i, double %val) nounwind {
 ; CHECK-NEXT: ret
 
   %conv = fptrunc double %val to float
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv)
   %idxprom = sext i32 %i to i64
   %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
   store i16 %tmp, i16* %arrayidx, align 2
@@ -181,7 +176,7 @@ define void @store4(i16* nocapture %a, i64 %i, float %val) nounwind {
 ; CHECK-NEXT: str h0, [x0, x1, lsl #1]
 ; CHECK-NEXT: ret
 
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val)
   %arrayidx = getelementptr inbounds i16* %a, i64 %i
   store i16 %tmp, i16* %arrayidx, align 2
   ret void
@@ -194,7 +189,7 @@ define void @store5(i16* nocapture %a, i64 %i, double %val) nounwind {
 ; CHECK-NEXT: ret
 
   %conv = fptrunc double %val to float
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv)
   %arrayidx = getelementptr inbounds i16* %a, i64 %i
   store i16 %tmp, i16* %arrayidx, align 2
   ret void
@@ -206,7 +201,7 @@ define void @store6(i16* nocapture %a, float %val) nounwind {
 ; CHECK-NEXT: str h0, [x0, #20]
 ; CHECK-NEXT: ret
 
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val)
   %arrayidx = getelementptr inbounds i16* %a, i64 10
   store i16 %tmp, i16* %arrayidx, align 2
   ret void
@@ -219,7 +214,7 @@ define void @store7(i16* nocapture %a, double %val) nounwind {
 ; CHECK-NEXT: ret
 
   %conv = fptrunc double %val to float
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv)
   %arrayidx = getelementptr inbounds i16* %a, i64 10
   store i16 %tmp, i16* %arrayidx, align 2
   ret void
@@ -231,7 +226,7 @@ define void @store8(i16* nocapture %a, float %val) nounwind {
 ; CHECK-NEXT: stur h0, [x0, #-20]
 ; CHECK-NEXT: ret
 
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val)
   %arrayidx = getelementptr inbounds i16* %a, i64 -10
   store i16 %tmp, i16* %arrayidx, align 2
   ret void
@@ -244,11 +239,13 @@ define void @store9(i16* nocapture %a, double %val) nounwind {
 ; CHECK-NEXT: ret
 
   %conv = fptrunc double %val to float
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv)
   %arrayidx = getelementptr inbounds i16* %a, i64 -10
   store i16 %tmp, i16* %arrayidx, align 2
   ret void
 }
 
-declare i16 @llvm.convert.to.fp16(float) nounwind readnone
-declare float @llvm.convert.from.fp16(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
diff --git a/test/CodeGen/AArch64/fast-isel-addressing-modes.ll b/test/CodeGen/AArch64/fast-isel-addressing-modes.ll
new file mode 100644
index 0000000..d86f00d
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-addressing-modes.ll
@@ -0,0 +1,627 @@
+; RUN: llc -mtriple=aarch64-apple-darwin                             -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
+
+; Load / Store Base Register only
+define zeroext i1 @load_breg_i1(i1* %a) {
+; CHECK-LABEL: load_breg_i1
+; CHECK:       ldrb {{w[0-9]+}}, [x0]
+  %1 = load i1* %a
+  ret i1 %1
+}
+
+define zeroext i8 @load_breg_i8(i8* %a) {
+; CHECK-LABEL: load_breg_i8
+; CHECK:       ldrb {{w[0-9]+}}, [x0]
+  %1 = load i8* %a
+  ret i8 %1
+}
+
+define zeroext i16 @load_breg_i16(i16* %a) {
+; CHECK-LABEL: load_breg_i16
+; CHECK:       ldrh {{w[0-9]+}}, [x0]
+  %1 = load i16* %a
+  ret i16 %1
+}
+
+define i32 @load_breg_i32(i32* %a) {
+; CHECK-LABEL: load_breg_i32
+; CHECK:       ldr {{w[0-9]+}}, [x0]
+  %1 = load i32* %a
+  ret i32 %1
+}
+
+define i64 @load_breg_i64(i64* %a) {
+; CHECK-LABEL: load_breg_i64
+; CHECK:       ldr {{x[0-9]+}}, [x0]
+  %1 = load i64* %a
+  ret i64 %1
+}
+
+define float @load_breg_f32(float* %a) {
+; CHECK-LABEL: load_breg_f32
+; CHECK:       ldr {{s[0-9]+}}, [x0]
+  %1 = load float* %a
+  ret float %1
+}
+
+define double @load_breg_f64(double* %a) {
+; CHECK-LABEL: load_breg_f64
+; CHECK:       ldr {{d[0-9]+}}, [x0]
+  %1 = load double* %a
+  ret double %1
+}
+
+define void @store_breg_i1(i1* %a) {
+; CHECK-LABEL: store_breg_i1
+; CHECK:       strb wzr, [x0]
+  store i1 0, i1* %a
+  ret void
+}
+
+define void @store_breg_i1_2(i1* %a) {
+; CHECK-LABEL: store_breg_i1_2
+; CHECK:       strb {{w[0-9]+}}, [x0]
+  store i1 true, i1* %a
+  ret void
+}
+
+define void @store_breg_i8(i8* %a) {
+; CHECK-LABEL: store_breg_i8
+; CHECK:       strb wzr, [x0]
+  store i8 0, i8* %a
+  ret void
+}
+
+define void @store_breg_i16(i16* %a) {
+; CHECK-LABEL: store_breg_i16
+; CHECK:       strh wzr, [x0]
+  store i16 0, i16* %a
+  ret void
+}
+
+define void @store_breg_i32(i32* %a) {
+; CHECK-LABEL: store_breg_i32
+; CHECK:       str wzr, [x0]
+  store i32 0, i32* %a
+  ret void
+}
+
+define void @store_breg_i64(i64* %a) {
+; CHECK-LABEL: store_breg_i64
+; CHECK:       str xzr, [x0]
+  store i64 0, i64* %a
+  ret void
+}
+
+define void @store_breg_f32(float* %a) {
+; CHECK-LABEL: store_breg_f32
+; CHECK:       str wzr, [x0]
+  store float 0.0, float* %a
+  ret void
+}
+
+define void @store_breg_f64(double* %a) {
+; CHECK-LABEL: store_breg_f64
+; CHECK:       str xzr, [x0]
+  store double 0.0, double* %a
+  ret void
+}
+
+; Load Immediate
+define i32 @load_immoff_1() {
+; CHECK-LABEL: load_immoff_1
+; CHECK:       orr {{w|x}}[[REG:[0-9]+]], {{wzr|xzr}}, #0x80
+; CHECK:       ldr {{w[0-9]+}}, {{\[}}x[[REG]]{{\]}}
+  %1 = inttoptr i64 128 to i32*
+  %2 = load i32* %1
+  ret i32 %2
+}
+
+; Load / Store Base Register + Immediate Offset
+; Max supported negative offset
+define i32 @load_breg_immoff_1(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_1
+; CHECK:       ldur {{w[0-9]+}}, [x0, #-256]
+  %1 = add i64 %a, -256
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  ret i32 %3
+}
+
+; Min not-supported negative offset
+define i32 @load_breg_immoff_2(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_2
+; CHECK:       sub [[REG:x[0-9]+]], x0, #257
+; CHECK-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]]{{\]}}
+  %1 = add i64 %a, -257
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  ret i32 %3
+}
+
+; Max supported unscaled offset
+define i32 @load_breg_immoff_3(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_3
+; CHECK:       ldur {{w[0-9]+}}, [x0, #255]
+  %1 = add i64 %a, 255
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  ret i32 %3
+}
+
+; Min un-supported unscaled offset
+define i32 @load_breg_immoff_4(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_4
+; CHECK:       add [[REG:x[0-9]+]], x0, #257
+; CHECK-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]]{{\]}}
+  %1 = add i64 %a, 257
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  ret i32 %3
+}
+
+; Max supported scaled offset
+define i32 @load_breg_immoff_5(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_5
+; CHECK:       ldr {{w[0-9]+}}, [x0, #16380]
+  %1 = add i64 %a, 16380
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  ret i32 %3
+}
+
+; Min un-supported scaled offset
+define i32 @load_breg_immoff_6(i64 %a) {
+; SDAG-LABEL: load_breg_immoff_6
+; SDAG:       orr	w[[NUM:[0-9]+]], wzr, #0x4000
+; SDAG-NEXT:  ldr {{w[0-9]+}}, [x0, x[[NUM]]]
+; FAST-LABEL: load_breg_immoff_6
+; FAST:       add [[REG:x[0-9]+]], x0, #4, lsl #12
+; FAST-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]]{{\]}}
+  %1 = add i64 %a, 16384
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  ret i32 %3
+}
+
+; Max supported negative offset
+define void @store_breg_immoff_1(i64 %a) {
+; CHECK-LABEL: store_breg_immoff_1
+; CHECK:       stur wzr, [x0, #-256]
+  %1 = add i64 %a, -256
+  %2 = inttoptr i64 %1 to i32*
+  store i32 0, i32* %2
+  ret void
+}
+
+; Min not-supported negative offset
+define void @store_breg_immoff_2(i64 %a) {
+; CHECK-LABEL: store_breg_immoff_2
+; CHECK:       sub [[REG:x[0-9]+]], x0, #257
+; CHECK-NEXT:  str wzr, {{\[}}[[REG]]{{\]}}
+  %1 = add i64 %a, -257
+  %2 = inttoptr i64 %1 to i32*
+  store i32 0, i32* %2
+  ret void
+}
+
+; Max supported unscaled offset
+define void @store_breg_immoff_3(i64 %a) {
+; CHECK-LABEL: store_breg_immoff_3
+; CHECK:       stur wzr, [x0, #255]
+  %1 = add i64 %a, 255
+  %2 = inttoptr i64 %1 to i32*
+  store i32 0, i32* %2
+  ret void
+}
+
+; Min un-supported unscaled offset
+define void @store_breg_immoff_4(i64 %a) {
+; CHECK-LABEL: store_breg_immoff_4
+; CHECK:       add [[REG:x[0-9]+]], x0, #257
+; CHECK-NEXT:  str wzr, {{\[}}[[REG]]{{\]}}
+  %1 = add i64 %a, 257
+  %2 = inttoptr i64 %1 to i32*
+  store i32 0, i32* %2
+  ret void
+}
+
+; Max supported scaled offset
+define void @store_breg_immoff_5(i64 %a) {
+; CHECK-LABEL: store_breg_immoff_5
+; CHECK:       str wzr, [x0, #16380]
+  %1 = add i64 %a, 16380
+  %2 = inttoptr i64 %1 to i32*
+  store i32 0, i32* %2
+  ret void
+}
+
+; Min un-supported scaled offset
+define void @store_breg_immoff_6(i64 %a) {
+; SDAG-LABEL: store_breg_immoff_6
+; SDAG:       orr	w[[NUM:[0-9]+]], wzr, #0x4000
+; SDAG-NEXT:  str wzr, [x0, x[[NUM]]]
+; FAST-LABEL: store_breg_immoff_6
+; FAST:       add [[REG:x[0-9]+]], x0, #4, lsl #12
+; FAST-NEXT:  str wzr, {{\[}}[[REG]]{{\]}}
+  %1 = add i64 %a, 16384
+  %2 = inttoptr i64 %1 to i32*
+  store i32 0, i32* %2
+  ret void
+}
+
+define i64 @load_breg_immoff_7(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_7
+; CHECK:       ldr {{x[0-9]+}}, [x0, #48]
+  %1 = add i64 %a, 48
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load i64* %2
+  ret i64 %3
+}
+
+; Flip add operands
+define i64 @load_breg_immoff_8(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_8
+; CHECK:       ldr {{x[0-9]+}}, [x0, #48]
+  %1 = add i64 48, %a
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load i64* %2
+  ret i64 %3
+}
+
+; Load Base Register + Register Offset
+define i64 @load_breg_offreg_1(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_offreg_1
+; CHECK:       ldr {{x[0-9]+}}, [x0, x1]
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load i64* %2
+  ret i64 %3
+}
+
+; Flip add operands
+define i64 @load_breg_offreg_2(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_offreg_2
+; CHECK:       ldr {{x[0-9]+}}, [x1, x0]
+  %1 = add i64 %b, %a
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load i64* %2
+  ret i64 %3
+}
+
+; Load Base Register + Register Offset + Immediate Offset
+define i64 @load_breg_offreg_immoff_1(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_offreg_immoff_1
+; CHECK:       add [[REG:x[0-9]+]], x0, x1
+; CHECK-NEXT:  ldr x0, {{\[}}[[REG]], #48{{\]}}
+  %1 = add i64 %a, %b
+  %2 = add i64 %1, 48
+  %3 = inttoptr i64 %2 to i64*
+  %4 = load i64* %3
+  ret i64 %4
+}
+
+define i64 @load_breg_offreg_immoff_2(i64 %a, i64 %b) {
+; SDAG-LABEL: load_breg_offreg_immoff_2
+; SDAG:       add [[REG1:x[0-9]+]], x0, x1
+; SDAG-NEXT:  orr w[[NUM:[0-9]+]], wzr, #0xf000
+; SDAG-NEXT:  ldr x0, {{\[}}[[REG1]], x[[NUM]]]
+; FAST-LABEL: load_breg_offreg_immoff_2
+; FAST:       add [[REG:x[0-9]+]], x0, #15, lsl #12
+; FAST-NEXT:  ldr x0, {{\[}}[[REG]], x1{{\]}}
+  %1 = add i64 %a, %b
+  %2 = add i64 %1, 61440
+  %3 = inttoptr i64 %2 to i64*
+  %4 = load i64* %3
+  ret i64 %4
+}
+
+; Load Scaled Register Offset
+define i32 @load_shift_offreg_1(i64 %a) {
+; CHECK-LABEL: load_shift_offreg_1
+; CHECK:       lsl [[REG:x[0-9]+]], x0, #2
+; CHECK:       ldr {{w[0-9]+}}, {{\[}}[[REG]]{{\]}}
+  %1 = shl i64 %a, 2
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  ret i32 %3
+}
+
+define i32 @load_mul_offreg_1(i64 %a) {
+; CHECK-LABEL: load_mul_offreg_1
+; CHECK:       lsl [[REG:x[0-9]+]], x0, #2
+; CHECK:       ldr {{w[0-9]+}}, {{\[}}[[REG]]{{\]}}
+  %1 = mul i64 %a, 4
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  ret i32 %3
+}
+
+; Load Base Register + Scaled Register Offset
+define i32 @load_breg_shift_offreg_1(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_shift_offreg_1
+; CHECK:       ldr {{w[0-9]+}}, [x1, x0, lsl #2]
+  %1 = shl i64 %a, 2
+  %2 = add i64 %1, %b
+  %3 = inttoptr i64 %2 to i32*
+  %4 = load i32* %3
+  ret i32 %4
+}
+
+define i32 @load_breg_shift_offreg_2(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_shift_offreg_2
+; CHECK:       ldr {{w[0-9]+}}, [x1, x0, lsl #2]
+  %1 = shl i64 %a, 2
+  %2 = add i64 %b, %1
+  %3 = inttoptr i64 %2 to i32*
+  %4 = load i32* %3
+  ret i32 %4
+}
+
+define i32 @load_breg_shift_offreg_3(i64 %a, i64 %b) {
+; SDAG-LABEL: load_breg_shift_offreg_3
+; SDAG:       lsl [[REG:x[0-9]+]], x0, #2
+; SDAG-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]], x1, lsl #2{{\]}}
+; FAST-LABEL: load_breg_shift_offreg_3
+; FAST:       lsl [[REG:x[0-9]+]], x1, #2
+; FAST-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+  %1 = shl i64 %a, 2
+  %2 = shl i64 %b, 2
+  %3 = add i64 %1, %2
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+define i32 @load_breg_shift_offreg_4(i64 %a, i64 %b) {
+; SDAG-LABEL: load_breg_shift_offreg_4
+; SDAG:       lsl [[REG:x[0-9]+]], x1, #2
+; SDAG-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+; FAST-LABEL: load_breg_shift_offreg_4
+; FAST:       lsl [[REG:x[0-9]+]], x0, #2
+; FAST-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]], x1, lsl #2{{\]}}
+  %1 = shl i64 %a, 2
+  %2 = shl i64 %b, 2
+  %3 = add i64 %2, %1
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+define i32 @load_breg_shift_offreg_5(i64 %a, i64 %b) {
+; SDAG-LABEL: load_breg_shift_offreg_5
+; SDAG:       lsl [[REG:x[0-9]+]], x1, #3
+; SDAG-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+; FAST-LABEL: load_breg_shift_offreg_5
+; FAST:       lsl [[REG:x[0-9]+]], x1, #3
+; FAST-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+  %1 = shl i64 %a, 2
+  %2 = shl i64 %b, 3
+  %3 = add i64 %1, %2
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+define i32 @load_breg_mul_offreg_1(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_mul_offreg_1
+; CHECK:       ldr {{w[0-9]+}}, [x1, x0, lsl #2]
+  %1 = mul i64 %a, 4
+  %2 = add i64 %1, %b
+  %3 = inttoptr i64 %2 to i32*
+  %4 = load i32* %3
+  ret i32 %4
+}
+
+define zeroext i8 @load_breg_and_offreg_1(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_and_offreg_1
+; CHECK:       ldrb {{w[0-9]+}}, [x1, w0, uxtw]
+  %1 = and i64 %a, 4294967295
+  %2 = add i64 %1, %b
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  ret i8 %4
+}
+
+define zeroext i16 @load_breg_and_offreg_2(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_and_offreg_2
+; CHECK:       ldrh {{w[0-9]+}}, [x1, w0, uxtw #1]
+  %1 = and i64 %a, 4294967295
+  %2 = shl i64 %1, 1
+  %3 = add i64 %2, %b
+  %4 = inttoptr i64 %3 to i16*
+  %5 = load i16* %4
+  ret i16 %5
+}
+
+define i32 @load_breg_and_offreg_3(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_and_offreg_3
+; CHECK:       ldr {{w[0-9]+}}, [x1, w0, uxtw #2]
+  %1 = and i64 %a, 4294967295
+  %2 = shl i64 %1, 2
+  %3 = add i64 %2, %b
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+define i64 @load_breg_and_offreg_4(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_and_offreg_4
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
+  %1 = and i64 %a, 4294967295
+  %2 = shl i64 %1, 3
+  %3 = add i64 %2, %b
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+; Not all 'and' instructions have immediates.
+define i64 @load_breg_and_offreg_5(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: load_breg_and_offreg_5
+; CHECK:       and [[REG:x[0-9]+]], x0, x2
+; CHECK-NEXT:  ldr {{x[0-9]+}}, {{\[}}[[REG]], x1{{\]}}
+  %1 = and i64 %a, %c
+  %2 = add i64 %1, %b
+  %3 = inttoptr i64 %2 to i64*
+  %4 = load i64* %3
+  ret i64 %4
+}
+
+define i64 @load_breg_and_offreg_6(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: load_breg_and_offreg_6
+; CHECK:       and [[REG:x[0-9]+]], x0, x2
+; CHECK-NEXT:  ldr {{x[0-9]+}}, {{\[}}x1, [[REG]], lsl #3{{\]}}
+  %1 = and i64 %a, %c
+  %2 = shl i64 %1, 3
+  %3 = add i64 %2, %b
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+; Load Base Register + Scaled Register Offset + Sign/Zero extension
+define i32 @load_breg_zext_shift_offreg_1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_zext_shift_offreg_1
+; CHECK:       ldr {{w[0-9]+}}, [x1, w0, uxtw #2]
+  %1 = zext i32 %a to i64
+  %2 = shl i64 %1, 2
+  %3 = add i64 %2, %b
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+define i32 @load_breg_zext_shift_offreg_2(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_zext_shift_offreg_2
+; CHECK:       ldr {{w[0-9]+}}, [x1, w0, uxtw #2]
+  %1 = zext i32 %a to i64
+  %2 = shl i64 %1, 2
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+define i32 @load_breg_zext_mul_offreg_1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_zext_mul_offreg_1
+; CHECK:       ldr {{w[0-9]+}}, [x1, w0, uxtw #2]
+  %1 = zext i32 %a to i64
+  %2 = mul i64 %1, 4
+  %3 = add i64 %2, %b
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+define i32 @load_breg_sext_shift_offreg_1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_sext_shift_offreg_1
+; CHECK:       ldr {{w[0-9]+}}, [x1, w0, sxtw #2]
+  %1 = sext i32 %a to i64
+  %2 = shl i64 %1, 2
+  %3 = add i64 %2, %b
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+define i32 @load_breg_sext_shift_offreg_2(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_sext_shift_offreg_2
+; CHECK:       ldr {{w[0-9]+}}, [x1, w0, sxtw #2]
+  %1 = sext i32 %a to i64
+  %2 = shl i64 %1, 2
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+; Make sure that we don't drop the first 'add' instruction.
+define i32 @load_breg_sext_shift_offreg_3(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_sext_shift_offreg_3
+; CHECK:       add [[REG:w[0-9]+]], w0, #4
+; CHECK:       ldr {{w[0-9]+}}, {{\[}}x1, [[REG]], sxtw #2{{\]}}
+  %1 = add i32 %a, 4
+  %2 = sext i32 %1 to i64
+  %3 = shl i64 %2, 2
+  %4 = add i64 %b, %3
+  %5 = inttoptr i64 %4 to i32*
+  %6 = load i32* %5
+  ret i32 %6
+}
+
+
+define i32 @load_breg_sext_mul_offreg_1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_sext_mul_offreg_1
+; CHECK:       ldr {{w[0-9]+}}, [x1, w0, sxtw #2]
+  %1 = sext i32 %a to i64
+  %2 = mul i64 %1, 4
+  %3 = add i64 %2, %b
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+; Load Scaled Register Offset + Immediate Offset + Sign/Zero extension
+define i64 @load_sext_shift_offreg_imm1(i32 %a) {
+; CHECK-LABEL: load_sext_shift_offreg_imm1
+; CHECK:       sbfiz [[REG:x[0-9]+]], {{x[0-9]+}}, #3, #32
+; CHECK-NEXT:  ldr {{x[0-9]+}}, {{\[}}[[REG]], #8{{\]}}
+  %1 = sext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %2, 8
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+; Load Base Register + Scaled Register Offset + Immediate Offset + Sign/Zero extension
+define i64 @load_breg_sext_shift_offreg_imm1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_sext_shift_offreg_imm1
+; CHECK:       add [[REG:x[0-9]+]], x1, w0, sxtw #3
+; CHECK-NEXT:  ldr {{x[0-9]+}}, {{\[}}[[REG]], #8{{\]}}
+  %1 = sext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %b, %2
+  %4 = add i64 %3, 8
+  %5 = inttoptr i64 %4 to i64*
+  %6 = load i64* %5
+  ret i64 %6
+}
+
+; Test that the kill flag is not set - the machine instruction verifier does that for us.
+define i64 @kill_reg(i64 %a) {
+  %1 = sub i64 %a, 8
+  %2 = add i64 %1, 96
+  %3 = inttoptr i64 %2 to i64*
+  %4 = load i64* %3
+  %5 = add i64 %2, %4
+  ret i64 %5
+}
+
+define void @store_fi(i64 %i) {
+; CHECK-LABEL: store_fi
+; CHECK:       mov [[REG:x[0-9]+]], sp
+; CHECK:       str {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+  %1 = alloca [8 x i32]
+  %2 = ptrtoint [8 x i32]* %1 to i64
+  %3 = mul i64 %i, 4
+  %4 = add i64 %2, %3
+  %5 = inttoptr i64 %4 to i32*
+  store i32 47, i32* %5, align 4
+  ret void
+}
+
+define i32 @load_fi(i64 %i) {
+; CHECK-LABEL: load_fi
+; CHECK:       mov [[REG:x[0-9]+]], sp
+; CHECK:       ldr {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+  %1 = alloca [8 x i32]
+  %2 = ptrtoint [8 x i32]* %1 to i64
+  %3 = mul i64 %i, 4
+  %4 = add i64 %2, %3
+  %5 = inttoptr i64 %4 to i32*
+  %6 = load i32* %5, align 4
+  ret i32 %6
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-branch_weights.ll b/test/CodeGen/AArch64/fast-isel-branch_weights.ll
new file mode 100644
index 0000000..5b22476
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-branch_weights.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=arm64-apple-darwin -aarch64-atomic-cfg-tidy=0                             -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -aarch64-atomic-cfg-tidy=0 -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+; Test if the BBs are reordred according to their branch weights.
+define i64 @branch_weights_test(i64 %a, i64 %b) {
+; CHECK-LABEL: branch_weights_test
+; CHECK-LABEL: success
+; CHECK-LABEL: fail
+  %1 = icmp ult i64 %a, %b
+  br i1 %1, label %fail, label %success, !prof !0
+
+fail:
+  ret i64 -1
+
+success:
+  ret i64 0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 0, i32 2147483647}
diff --git a/test/CodeGen/AArch64/fast-isel-call-return.ll b/test/CodeGen/AArch64/fast-isel-call-return.ll
new file mode 100644
index 0000000..9b10969
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-call-return.ll
@@ -0,0 +1,12 @@
+; RUN: llc -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linux-gnu"
+
+define i8* @test_call_return_type(i64 %size) {
+entry:
+; CHECK: bl xmalloc
+  %0 = call noalias i8* @xmalloc(i64 undef)
+  ret i8* %0
+}
+
+declare noalias i8* @xmalloc(i64)
diff --git a/test/CodeGen/AArch64/fast-isel-cbz.ll b/test/CodeGen/AArch64/fast-isel-cbz.ll
new file mode 100644
index 0000000..6e31a04
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-cbz.ll
@@ -0,0 +1,70 @@
+; RUN: llc -fast-isel -fast-isel-abort -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+
+define i32 @icmp_eq_i1(i1 %a) {
+; CHECK-LABEL: icmp_eq_i1
+; CHECK:       tbz w0, #0, {{LBB.+_2}}
+  %1 = icmp eq i1 %a, 0
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_eq_i8(i8 %a) {
+; CHECK-LABEL: icmp_eq_i8
+; CHECK:       uxtb [[REG:w[0-9]+]], w0
+; CHECK:       cbz [[REG]], {{LBB.+_2}}
+  %1 = icmp eq i8 %a, 0
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_eq_i16(i16 %a) {
+; CHECK-LABEL: icmp_eq_i16
+; CHECK:       uxth [[REG:w[0-9]+]], w0
+; CHECK:       cbz [[REG]], {{LBB.+_2}}
+  %1 = icmp eq i16 %a, 0
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_eq_i32(i32 %a) {
+; CHECK-LABEL: icmp_eq_i32
+; CHECK:       cbz w0, {{LBB.+_2}}
+  %1 = icmp eq i32 %a, 0
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_eq_i64(i64 %a) {
+; CHECK-LABEL: icmp_eq_i64
+; CHECK:       cbz x0, {{LBB.+_2}}
+  %1 = icmp eq i64 %a, 0
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_eq_ptr(i8* %a) {
+; CHECK-LABEL: icmp_eq_ptr
+; CHECK:       cbz x0, {{LBB.+_2}}
+  %1 = icmp eq i8* %a, null
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-cmp-branch.ll b/test/CodeGen/AArch64/fast-isel-cmp-branch.ll
new file mode 100644
index 0000000..3651f19
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-cmp-branch.ll
@@ -0,0 +1,293 @@
+; RUN: llc                             -aarch64-atomic-cfg-tidy=0 -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+; RUN: llc -fast-isel -fast-isel-abort -aarch64-atomic-cfg-tidy=0 -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+
+define i32 @fcmp_oeq(float %x, float %y) {
+; CHECK-LABEL: fcmp_oeq
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.ne {{LBB.+_2}}
+  %1 = fcmp oeq float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_ogt(float %x, float %y) {
+; CHECK-LABEL: fcmp_ogt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.le {{LBB.+_2}}
+  %1 = fcmp ogt float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_oge(float %x, float %y) {
+; CHECK-LABEL: fcmp_oge
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.lt {{LBB.+_2}}
+  %1 = fcmp oge float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_olt(float %x, float %y) {
+; CHECK-LABEL: fcmp_olt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.pl {{LBB.+_2}}
+  %1 = fcmp olt float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_ole(float %x, float %y) {
+; CHECK-LABEL: fcmp_ole
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.hi {{LBB.+_2}}
+  %1 = fcmp ole float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_one(float %x, float %y) {
+; CHECK-LABEL: fcmp_one
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.mi
+; CHECK-NEXT:  b.gt
+  %1 = fcmp one float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_ord(float %x, float %y) {
+; CHECK-LABEL: fcmp_ord
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.vs {{LBB.+_2}}
+  %1 = fcmp ord float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_uno(float %x, float %y) {
+; CHECK-LABEL: fcmp_uno
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.vs {{LBB.+_2}}
+  %1 = fcmp uno float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_ueq(float %x, float %y) {
+; CHECK-LABEL: fcmp_ueq
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.eq {{LBB.+_2}}
+; CHECK-NEXT:  b.vs {{LBB.+_2}}
+  %1 = fcmp ueq float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_ugt(float %x, float %y) {
+; CHECK-LABEL: fcmp_ugt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.ls {{LBB.+_2}}
+  %1 = fcmp ugt float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_uge(float %x, float %y) {
+; CHECK-LABEL: fcmp_uge
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.mi {{LBB.+_2}}
+  %1 = fcmp uge float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_ult(float %x, float %y) {
+; CHECK-LABEL: fcmp_ult
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.ge {{LBB.+_2}}
+  %1 = fcmp ult float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_ule(float %x, float %y) {
+; CHECK-LABEL: fcmp_ule
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.gt {{LBB.+_2}}
+  %1 = fcmp ule float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_une(float %x, float %y) {
+; CHECK-LABEL: fcmp_une
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.eq {{LBB.+_2}}
+  %1 = fcmp une float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_eq(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_eq
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.ne {{LBB.+_2}}
+  %1 = icmp eq i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_ne(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_ne
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.eq {{LBB.+_2}}
+  %1 = icmp ne i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_ugt(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_ugt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.ls {{LBB.+_2}}
+  %1 = icmp ugt i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_uge(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_uge
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.lo {{LBB.+_2}}
+  %1 = icmp uge i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_ult(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_ult
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.hs {{LBB.+_2}}
+  %1 = icmp ult i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_ule(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_ule
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.hi {{LBB.+_2}}
+  %1 = icmp ule i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_sgt(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_sgt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.le {{LBB.+_2}}
+  %1 = icmp sgt i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_sge(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_sge
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.lt {{LBB.+_2}}
+  %1 = icmp sge i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_slt(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_slt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.ge {{LBB.+_2}}
+  %1 = icmp slt i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_sle(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_sle
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.gt {{LBB.+_2}}
+  %1 = icmp sle i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-folding.ll b/test/CodeGen/AArch64/fast-isel-folding.ll
new file mode 100644
index 0000000..6b524ff
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-folding.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -O0 -fast-isel-abort -verify-machineinstrs < %s
+
+; Test that we don't fold the shift.
+define i64 @fold_shift_test(i64 %a, i1 %c) {
+  %1 = sub i64 %a, 8
+  %2 = ashr i64 %1, 3
+  br i1 %c, label %bb1, label %bb2
+bb1:
+  %3 = icmp ult i64 0, %2
+  br i1 %3, label %bb2, label %bb3
+bb2:
+  ret i64 1
+bb3:
+  ret i64 2
+}
+
+; Test that we don't fold the sign-extend.
+define i64 @fold_sext_test1(i32 %a, i1 %c) {
+  %1 = sub i32 %a, 8
+  %2 = sext i32 %1 to i64
+  br i1 %c, label %bb1, label %bb2
+bb1:
+  %3 = icmp ult i64 0, %2
+  br i1 %3, label %bb2, label %bb3
+bb2:
+  ret i64 1
+bb3:
+  ret i64 2
+}
+
+; Test that we don't fold the sign-extend.
+define i64 @fold_sext_test2(i32 %a, i1 %c) {
+  %1 = sub i32 %a, 8
+  %2 = sext i32 %1 to i64
+  br i1 %c, label %bb1, label %bb2
+bb1:
+  %3 = shl i64 %2, 4
+  ret i64 %3
+bb2:
+  ret i64 %2
+}
+
+; Test that we clear the kill flag.
+define i32 @fold_kill_test(i32 %a) {
+  %1 = sub i32 %a, 8
+  %2 = shl i32 %1, 3
+  %3 = icmp ult i32 0, %2
+  br i1 %3, label %bb1, label %bb2
+bb1:
+  ret i32 %2
+bb2:
+  %4 = add i32 %2, 4
+  ret i32 %4
+}
diff --git a/test/CodeGen/AArch64/fast-isel-gep.ll b/test/CodeGen/AArch64/fast-isel-gep.ll
new file mode 100644
index 0000000..4dc0a05
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-gep.ll
@@ -0,0 +1,49 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+%struct.foo = type { i32, i64, float, double }
+
+define double* @test_struct(%struct.foo* %f) {
+; CHECK-LABEL: test_struct
+; CHECK:       add x0, x0, #24
+  %1 = getelementptr inbounds %struct.foo* %f, i64 0, i32 3
+  ret double* %1
+}
+
+define i32* @test_array1(i32* %a, i64 %i) {
+; CHECK-LABEL: test_array1
+; CHECK:       orr [[REG:x[0-9]+]], xzr, #0x4
+; CHECK-NEXT:  madd  x0, x1, [[REG]], x0
+  %1 = getelementptr inbounds i32* %a, i64 %i
+  ret i32* %1
+}
+
+define i32* @test_array2(i32* %a) {
+; CHECK-LABEL: test_array2
+; CHECK:       add  x0, x0, #16
+  %1 = getelementptr inbounds i32* %a, i64 4
+  ret i32* %1
+}
+
+define i32* @test_array3(i32* %a) {
+; CHECK-LABEL: test_array3
+; CHECK:       add x0, x0, #1, lsl #12
+  %1 = getelementptr inbounds i32* %a, i64 1024
+  ret i32* %1
+}
+
+define i32* @test_array4(i32* %a) {
+; CHECK-LABEL: test_array4
+; CHECK:       movz [[REG:x[0-9]+]], #0x1008
+; CHECK-NEXR:  add x0, x0, [[REG]]
+  %1 = getelementptr inbounds i32* %a, i64 1026
+  ret i32* %1
+}
+
+define i32* @test_array5(i32* %a, i32 %i) {
+; CHECK-LABEL: test_array5
+; CHECK:       sxtw [[REG1:x[0-9]+]], w1
+; CHECK-NEXT:  orr  [[REG2:x[0-9]+]], xzr, #0x4
+; CHECK-NEXT:  madd  {{x[0-9]+}}, [[REG1]], [[REG2]], x0
+  %1 = getelementptr inbounds i32* %a, i32 %i
+  ret i32* %1
+}
diff --git a/test/CodeGen/AArch64/fast-isel-int-ext.ll b/test/CodeGen/AArch64/fast-isel-int-ext.ll
new file mode 100644
index 0000000..866feba
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-int-ext.ll
@@ -0,0 +1,491 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+;
+; Test that we only use the sign/zero extend in the address calculation when
+; necessary.
+;
+; SHIFT
+;
+define i64 @load_addr_shift_zext1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_zext1
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
+  %1 = zext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_shift_zext2(i32 zeroext %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_zext2
+; CHECK:       ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+  %1 = zext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_shift_zext3(i32 signext %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_zext3
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
+  %1 = zext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_shift_sext1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_sext1
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, sxtw #3]
+  %1 = sext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_shift_sext2(i32 zeroext %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_sext2
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, sxtw #3]
+  %1 = sext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_shift_sext3(i32 signext %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_sext3
+; CHECK:       ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+  %1 = sext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+;
+; MUL
+;
+define i64 @load_addr_mul_zext1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_zext1
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
+  %1 = zext i32 %a to i64
+  %2 = mul i64 %1, 8
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_mul_zext2(i32 zeroext %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_zext2
+; CHECK:       ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+  %1 = zext i32 %a to i64
+  %2 = mul i64 %1, 8
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_mul_zext3(i32 signext %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_zext3
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
+  %1 = zext i32 %a to i64
+  %2 = mul i64 %1, 8
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_mul_sext1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_sext1
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, sxtw #3]
+  %1 = sext i32 %a to i64
+  %2 = mul i64 %1, 8
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_mul_sext2(i32 zeroext %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_sext2
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, sxtw #3]
+  %1 = sext i32 %a to i64
+  %2 = mul i64 %1, 8
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_mul_sext3(i32 signext %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_sext3
+; CHECK:       ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+  %1 = sext i32 %a to i64
+  %2 = mul i64 %1, 8
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+
+;
+; Test folding of the sign-/zero-extend into the load instruction.
+;
+
+; Unscaled
+define i32 @load_unscaled_zext_i8_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i8_to_i32
+; CHECK:       ldurb w0, [x0, #-8]
+; CHECK-NOT:   uxtb
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = zext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @load_unscaled_zext_i16_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i16_to_i32
+; CHECK:       ldurh w0, [x0, #-8]
+; CHECK-NOT:   uxth
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = zext i16 %3 to i32
+  ret i32 %4
+}
+
+define i64 @load_unscaled_zext_i8_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i8_to_i64
+; CHECK:       ldurb w0, [x0, #-8]
+; CHECK-NOT:   uxtb
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = zext i8 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i16_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i16_to_i64
+; CHECK:       ldurh w0, [x0, #-8]
+; CHECK-NOT:   uxth
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = zext i16 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i32_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i32_to_i64
+; CHECK:       ldur w0, [x0, #-8]
+; CHECK-NOT:   uxtw
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  %4 = zext i32 %3 to i64
+  ret i64 %4
+}
+
+define i32 @load_unscaled_sext_i8_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i8_to_i32
+; CHECK:       ldursb w0, [x0, #-8]
+; CHECK-NOT:   sxtb
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = sext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @load_unscaled_sext_i16_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i16_to_i32
+; CHECK:       ldursh w0, [x0, #-8]
+; CHECK-NOT:   sxth
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = sext i16 %3 to i32
+  ret i32 %4
+}
+
+define i64 @load_unscaled_sext_i8_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i8_to_i64
+; CHECK:       ldursb x0, [x0, #-8]
+; CHECK-NOT:   sxtb
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = sext i8 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i16_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i16_to_i64
+; CHECK:       ldursh x0, [x0, #-8]
+; CHECK-NOT:   sxth
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = sext i16 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i32_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i32_to_i64
+; CHECK:       ldursw x0, [x0, #-8]
+; CHECK-NOT:   sxtw
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  %4 = sext i32 %3 to i64
+  ret i64 %4
+}
+
+; Register
+define i32 @load_register_zext_i8_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i8_to_i32
+; CHECK:       ldrb w0, [x0, x1]
+; CHECK-NOT:   uxtb
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = zext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @load_register_zext_i16_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i16_to_i32
+; CHECK:       ldrh w0, [x0, x1]
+; CHECK-NOT:   uxth
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = zext i16 %3 to i32
+  ret i32 %4
+}
+
+define i64 @load_register_zext_i8_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i8_to_i64
+; CHECK:       ldrb w0, [x0, x1]
+; CHECK-NOT:   uxtb
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = zext i8 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_register_zext_i16_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i16_to_i64
+; CHECK:       ldrh w0, [x0, x1]
+; CHECK-NOT:   uxth
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = zext i16 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_register_zext_i32_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i32_to_i64
+; CHECK:       ldr w0, [x0, x1]
+; CHECK-NOT:   uxtw
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  %4 = zext i32 %3 to i64
+  ret i64 %4
+}
+
+define i32 @load_register_sext_i8_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i8_to_i32
+; CHECK:       ldrsb w0, [x0, x1]
+; CHECK-NOT:   sxtb
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = sext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @load_register_sext_i16_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i16_to_i32
+; CHECK:       ldrsh w0, [x0, x1]
+; CHECK-NOT:   sxth
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = sext i16 %3 to i32
+  ret i32 %4
+}
+
+define i64 @load_register_sext_i8_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i8_to_i64
+; CHECK:       ldrsb x0, [x0, x1]
+; CHECK-NOT:   sxtb
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = sext i8 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_register_sext_i16_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i16_to_i64
+; CHECK:       ldrsh x0, [x0, x1]
+; CHECK-NOT:   sxth
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = sext i16 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_register_sext_i32_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i32_to_i64
+; CHECK:       ldrsw x0, [x0, x1]
+; CHECK-NOT:   sxtw
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  %4 = sext i32 %3 to i64
+  ret i64 %4
+}
+
+; Extend
+define i32 @load_extend_zext_i8_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i8_to_i32
+; CHECK:       ldrb w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxtb
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  %5 = zext i8 %4 to i32
+  ret i32 %5
+}
+
+define i32 @load_extend_zext_i16_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i16_to_i32
+; CHECK:       ldrh w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxth
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  %5 = zext i16 %4 to i32
+  ret i32 %5
+}
+
+define i64 @load_extend_zext_i8_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i8_to_i64
+; CHECK:       ldrb w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxtb
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  %5 = zext i8 %4 to i64
+  ret i64 %5
+}
+
+define i64 @load_extend_zext_i16_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i16_to_i64
+; CHECK:       ldrh w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxth
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  %5 = zext i16 %4 to i64
+  ret i64 %5
+}
+
+define i64 @load_extend_zext_i32_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i32_to_i64
+; CHECK:       ldr w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxtw
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i32*
+  %4 = load i32* %3
+  %5 = zext i32 %4 to i64
+  ret i64 %5
+}
+
+define i32 @load_extend_sext_i8_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i8_to_i32
+; CHECK:       ldrsb w0, [x0, w1, sxtw]
+; CHECK-NOT:   sxtb
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  %5 = sext i8 %4 to i32
+  ret i32 %5
+}
+
+define i32 @load_extend_sext_i16_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i16_to_i32
+; CHECK:       ldrsh w0, [x0, w1, sxtw]
+; CHECK-NOT:   sxth
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  %5 = sext i16 %4 to i32
+  ret i32 %5
+}
+
+define i64 @load_extend_sext_i8_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i8_to_i64
+; CHECK:       ldrsb x0, [x0, w1, sxtw]
+; CHECK-NOT:   sxtb
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  %5 = sext i8 %4 to i64
+  ret i64 %5
+}
+
+define i64 @load_extend_sext_i16_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i16_to_i64
+; CHECK:       ldrsh x0, [x0, w1, sxtw]
+; CHECK-NOT:   sxth
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  %5 = sext i16 %4 to i64
+  ret i64 %5
+}
+
+define i64 @load_extend_sext_i32_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i32_to_i64
+; CHECK:       ldrsw x0, [x0, w1, sxtw]
+; CHECK-NOT:   sxtw
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i32*
+  %4 = load i32* %3
+  %5 = sext i32 %4 to i64
+  ret i64 %5
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-int-ext2.ll b/test/CodeGen/AArch64/fast-isel-int-ext2.ll
new file mode 100644
index 0000000..8df26b2
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-int-ext2.ll
@@ -0,0 +1,439 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -aarch64-atomic-cfg-tidy=false -disable-cgp-branch-opts -verify-machineinstrs < %s | FileCheck %s
+
+;
+; Test folding of the sign-/zero-extend into the load instruction.
+;
+
+; Unscaled
+define i32 @load_unscaled_zext_i8_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i8_to_i32
+; CHECK:       ldurb w0, [x0, #-8]
+; CHECK-NOT:   uxtb
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  br label %bb2
+
+bb2:
+  %4 = zext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @load_unscaled_zext_i16_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i16_to_i32
+; CHECK:       ldurh w0, [x0, #-8]
+; CHECK-NOT:   uxth
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  br label %bb2
+
+bb2:
+  %4 = zext i16 %3 to i32
+  ret i32 %4
+}
+
+define i64 @load_unscaled_zext_i8_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i8_to_i64
+; CHECK:       ldurb w0, [x0, #-8]
+; CHECK-NOT:   uxtb
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  br label %bb2
+
+bb2:
+  %4 = zext i8 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i16_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i16_to_i64
+; CHECK:       ldurh w0, [x0, #-8]
+; CHECK-NOT:   uxth
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  br label %bb2
+
+bb2:
+  %4 = zext i16 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i32_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i32_to_i64
+; CHECK:       ldur w0, [x0, #-8]
+; CHECK-NOT:   uxtw
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  br label %bb2
+
+bb2:
+  %4 = zext i32 %3 to i64
+  ret i64 %4
+}
+
+define i32 @load_unscaled_sext_i8_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i8_to_i32
+; CHECK:       ldursb w0, [x0, #-8]
+; CHECK-NOT:   sxtb
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  br label %bb2
+
+bb2:
+  %4 = sext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @load_unscaled_sext_i16_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i16_to_i32
+; CHECK:       ldursh w0, [x0, #-8]
+; CHECK-NOT:   sxth
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  br label %bb2
+
+bb2:
+  %4 = sext i16 %3 to i32
+  ret i32 %4
+}
+
+define i64 @load_unscaled_sext_i8_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i8_to_i64
+; CHECK:       ldursb x0, [x0, #-8]
+; CHECK-NOT:   sxtb
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  br label %bb2
+
+bb2:
+  %4 = sext i8 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i16_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i16_to_i64
+; CHECK:       ldursh x0, [x0, #-8]
+; CHECK-NOT:   sxth
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  br label %bb2
+
+bb2:
+  %4 = sext i16 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i32_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i32_to_i64
+; CHECK:       ldursw x0, [x0, #-8]
+; CHECK-NOT:   sxtw
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  br label %bb2
+
+bb2:
+  %4 = sext i32 %3 to i64
+  ret i64 %4
+}
+
+; Register
+define i32 @load_register_zext_i8_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i8_to_i32
+; CHECK:       ldrb w0, [x0, x1]
+; CHECK-NOT:   uxtb
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  br label %bb2
+
+bb2:
+  %4 = zext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @load_register_zext_i16_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i16_to_i32
+; CHECK:       ldrh w0, [x0, x1]
+; CHECK-NOT:   uxth
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  br label %bb2
+
+bb2:
+  %4 = zext i16 %3 to i32
+  ret i32 %4
+}
+
+define i64 @load_register_zext_i8_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i8_to_i64
+; CHECK:       ldrb w0, [x0, x1]
+; CHECK-NOT:   uxtb
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  br label %bb2
+
+bb2:
+  %4 = zext i8 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_register_zext_i16_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i16_to_i64
+; CHECK:       ldrh w0, [x0, x1]
+; CHECK-NOT:   uxth
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  br label %bb2
+
+bb2:
+  %4 = zext i16 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_register_zext_i32_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i32_to_i64
+; CHECK:       ldr w0, [x0, x1]
+; CHECK-NOT:   uxtw
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  br label %bb2
+
+bb2:
+  %4 = zext i32 %3 to i64
+  ret i64 %4
+}
+
+define i32 @load_register_sext_i8_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i8_to_i32
+; CHECK:       ldrsb w0, [x0, x1]
+; CHECK-NOT:   sxtb
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  br label %bb2
+
+bb2:
+  %4 = sext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @load_register_sext_i16_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i16_to_i32
+; CHECK:       ldrsh w0, [x0, x1]
+; CHECK-NOT:   sxth
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  br label %bb2
+
+bb2:
+  %4 = sext i16 %3 to i32
+  ret i32 %4
+}
+
+define i64 @load_register_sext_i8_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i8_to_i64
+; CHECK:       ldrsb x0, [x0, x1]
+; CHECK-NOT:   sxtb
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  br label %bb2
+
+bb2:
+  %4 = sext i8 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_register_sext_i16_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i16_to_i64
+; CHECK:       ldrsh x0, [x0, x1]
+; CHECK-NOT:   sxth
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  br label %bb2
+
+bb2:
+  %4 = sext i16 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_register_sext_i32_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i32_to_i64
+; CHECK:       ldrsw x0, [x0, x1]
+; CHECK-NOT:   sxtw
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  br label %bb2
+
+bb2:
+  %4 = sext i32 %3 to i64
+  ret i64 %4
+}
+
+; Extend
+define i32 @load_extend_zext_i8_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i8_to_i32
+; CHECK:       ldrb w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxtb
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  br label %bb2
+
+bb2:
+  %5 = zext i8 %4 to i32
+  ret i32 %5
+}
+
+define i32 @load_extend_zext_i16_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i16_to_i32
+; CHECK:       ldrh w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxth
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  br label %bb2
+
+bb2:
+  %5 = zext i16 %4 to i32
+  ret i32 %5
+}
+
+define i64 @load_extend_zext_i8_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i8_to_i64
+; CHECK:       ldrb w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxtb
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  br label %bb2
+
+bb2:
+  %5 = zext i8 %4 to i64
+  ret i64 %5
+}
+
+define i64 @load_extend_zext_i16_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i16_to_i64
+; CHECK:       ldrh w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxth
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  br label %bb2
+
+bb2:
+  %5 = zext i16 %4 to i64
+  ret i64 %5
+}
+
+define i64 @load_extend_zext_i32_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i32_to_i64
+; CHECK:       ldr w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxtw
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i32*
+  %4 = load i32* %3
+  br label %bb2
+
+bb2:
+  %5 = zext i32 %4 to i64
+  ret i64 %5
+}
+
+define i32 @load_extend_sext_i8_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i8_to_i32
+; CHECK:       ldrsb w0, [x0, w1, sxtw]
+; CHECK-NOT:   sxtb
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  br label %bb2
+
+bb2:
+  %5 = sext i8 %4 to i32
+  ret i32 %5
+}
+
+define i32 @load_extend_sext_i16_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i16_to_i32
+; CHECK:       ldrsh w0, [x0, w1, sxtw]
+; CHECK-NOT:   sxth
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  br label %bb2
+
+bb2:
+  %5 = sext i16 %4 to i32
+  ret i32 %5
+}
+
+define i64 @load_extend_sext_i8_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i8_to_i64
+; CHECK:       ldrsb x0, [x0, w1, sxtw]
+; CHECK-NOT:   sxtb
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  br label %bb2
+
+bb2:
+  %5 = sext i8 %4 to i64
+  ret i64 %5
+}
+
+define i64 @load_extend_sext_i16_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i16_to_i64
+; CHECK:       ldrsh x0, [x0, w1, sxtw]
+; CHECK-NOT:   sxth
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  br label %bb2
+
+bb2:
+  %5 = sext i16 %4 to i64
+  ret i64 %5
+}
+
+define i64 @load_extend_sext_i32_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i32_to_i64
+; CHECK:       ldrsw x0, [x0, w1, sxtw]
+; CHECK-NOT:   sxtw
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i32*
+  %4 = load i32* %3
+  br label %bb2
+
+bb2:
+  %5 = sext i32 %4 to i64
+  ret i64 %5
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-int-ext3.ll b/test/CodeGen/AArch64/fast-isel-int-ext3.ll
new file mode 100644
index 0000000..5d55a6b
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-int-ext3.ll
@@ -0,0 +1,117 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s | FileCheck %s
+
+;
+; Test folding of the sign-/zero-extend into the load instruction.
+;
+
+; Unscaled
+define i32 @load_unscaled_zext_i8_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i8_to_i32
+; CHECK:       ldurb [[REG:w[0-9]+]], [x0, #-8]
+; CHECK:       uxtb w0, [[REG]]
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i8 addrspace(256)*
+  %3 = load i8 addrspace(256)* %2
+  %4 = zext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @load_unscaled_zext_i16_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i16_to_i32
+; CHECK:       ldurh [[REG:w[0-9]+]], [x0, #-8]
+; CHECK:       uxth w0, [[REG]]
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i16 addrspace(256)*
+  %3 = load i16 addrspace(256)* %2
+  %4 = zext i16 %3 to i32
+  ret i32 %4
+}
+
+define i64 @load_unscaled_zext_i8_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i8_to_i64
+; CHECK:       ldurb w[[REG:[0-9]+]], [x0, #-8]
+; CHECK:       ubfx x0, x[[REG]], #0, #8
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i8 addrspace(256)*
+  %3 = load i8 addrspace(256)* %2
+  %4 = zext i8 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i16_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i16_to_i64
+; CHECK:       ldurh w[[REG:[0-9]+]], [x0, #-8]
+; CHECK:       ubfx x0, x[[REG]], #0, #16
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i16 addrspace(256)*
+  %3 = load i16 addrspace(256)* %2
+  %4 = zext i16 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i32_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i32_to_i64
+; CHECK:       ldur w[[REG:[0-9]+]], [x0, #-8]
+; CHECK:       ubfx x0, x[[REG]], #0, #32
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i32 addrspace(256)*
+  %3 = load i32 addrspace(256)* %2
+  %4 = zext i32 %3 to i64
+  ret i64 %4
+}
+
+define i32 @load_unscaled_sext_i8_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i8_to_i32
+; CHECK:       ldurb [[REG:w[0-9]+]], [x0, #-8]
+; CHECK:       sxtb w0, [[REG]]
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i8 addrspace(256)*
+  %3 = load i8 addrspace(256)* %2
+  %4 = sext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @load_unscaled_sext_i16_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i16_to_i32
+; CHECK:       ldurh [[REG:w[0-9]+]], [x0, #-8]
+; CHECK:       sxth w0, [[REG]]
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i16 addrspace(256)*
+  %3 = load i16 addrspace(256)* %2
+  %4 = sext i16 %3 to i32
+  ret i32 %4
+}
+
+define i64 @load_unscaled_sext_i8_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i8_to_i64
+; CHECK:       ldurb [[REG:w[0-9]+]], [x0, #-8]
+; CHECK:       sxtb x0, [[REG]]
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i8 addrspace(256)*
+  %3 = load i8 addrspace(256)* %2
+  %4 = sext i8 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i16_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i16_to_i64
+; CHECK:       ldurh [[REG:w[0-9]+]], [x0, #-8]
+; CHECK:       sxth x0, [[REG]]
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i16 addrspace(256)*
+  %3 = load i16 addrspace(256)* %2
+  %4 = sext i16 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i32_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i32_to_i64
+; CHECK:       ldur [[REG:w[0-9]+]], [x0, #-8]
+; CHECK:       sxtw x0, [[REG]]
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i32 addrspace(256)*
+  %3 = load i32 addrspace(256)* %2
+  %4 = sext i32 %3 to i64
+  ret i64 %4
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-int-ext4.ll b/test/CodeGen/AArch64/fast-isel-int-ext4.ll
new file mode 100644
index 0000000..f25bb98
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-int-ext4.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+define i32 @kill_flag(i16 signext %a) {
+; CHECK-LABEL: kill_flag
+entry:
+  %0 = sext i16 %a to i32
+  br label %bb1
+
+bb1:
+  %1 = icmp slt i32 undef, %0
+  br i1 %1, label %loop, label %exit
+
+loop:
+  %2 = sext i16 %a to i32
+  %3 = icmp slt i32 undef, %2
+  br i1 %3, label %bb1, label %exit
+
+exit:
+  ret i32 0
+}
diff --git a/test/CodeGen/AArch64/fast-isel-intrinsic.ll b/test/CodeGen/AArch64/fast-isel-intrinsic.ll
new file mode 100644
index 0000000..fd1198a
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-intrinsic.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=aarch64-apple-darwin            -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s | FileCheck %s
+
+define float @fabs_f32(float %a) {
+; CHECK-LABEL: fabs_f32
+; CHECK:       fabs s0, s0
+  %1 = call float @llvm.fabs.f32(float %a)
+  ret float %1
+}
+
+define double @fabs_f64(double %a) {
+; CHECK-LABEL: fabs_f64
+; CHECK:       fabs d0, d0
+  %1 = call double @llvm.fabs.f64(double %a)
+  ret double %1
+}
+
+declare double @llvm.fabs.f64(double)
+declare float @llvm.fabs.f32(float)
diff --git a/test/CodeGen/AArch64/fast-isel-logic-op.ll b/test/CodeGen/AArch64/fast-isel-logic-op.ll
new file mode 100644
index 0000000..2c7486e
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-logic-op.ll
@@ -0,0 +1,362 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=0                  -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=1 -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+; AND
+define zeroext i1 @and_rr_i1(i1 signext %a, i1 signext %b) {
+; CHECK-LABEL: and_rr_i1
+; CHECK:       and [[REG:w[0-9]+]], w0, w1
+  %1 = and i1 %a, %b
+  ret i1 %1
+}
+
+define zeroext i8 @and_rr_i8(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: and_rr_i8
+; CHECK:       and [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xff
+  %1 = and i8 %a, %b
+  ret i8 %1
+}
+
+define zeroext i16 @and_rr_i16(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: and_rr_i16
+; CHECK:       and [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xffff
+  %1 = and i16 %a, %b
+  ret i16 %1
+}
+
+define i32 @and_rr_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: and_rr_i32
+; CHECK:       and w0, w0, w1
+  %1 = and i32 %a, %b
+  ret i32 %1
+}
+
+define i64 @and_rr_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: and_rr_i64
+; CHECK:       and x0, x0, x1
+  %1 = and i64 %a, %b
+  ret i64 %1
+}
+
+define zeroext i1 @and_ri_i1(i1 signext %a) {
+; CHECK-LABEL: and_ri_i1
+; CHECK:       and {{w[0-9]+}}, w0, #0x1
+  %1 = and i1 %a, 1
+  ret i1 %1
+}
+
+define zeroext i8 @and_ri_i8(i8 signext %a) {
+; CHECK-LABEL: and_ri_i8
+; CHECK:       and {{w[0-9]+}}, w0, #0xf
+  %1 = and i8 %a, 15
+  ret i8 %1
+}
+
+define zeroext i16 @and_ri_i16(i16 signext %a) {
+; CHECK-LABEL: and_ri_i16
+; CHECK:       and {{w[0-9]+}}, w0, #0xff
+  %1 = and i16 %a, 255
+  ret i16 %1
+}
+
+define i32 @and_ri_i32(i32 %a) {
+; CHECK-LABEL: and_ri_i32
+; CHECK:       and w0, w0, #0xff
+  %1 = and i32 %a, 255
+  ret i32 %1
+}
+
+define i64 @and_ri_i64(i64 %a) {
+; CHECK-LABEL: and_ri_i64
+; CHECK:       and x0, x0, #0xff
+  %1 = and i64 %a, 255
+  ret i64 %1
+}
+
+define zeroext i8 @and_rs_i8(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: and_rs_i8
+; CHECK:       and [[REG:w[0-9]+]], w0, w1, lsl #4
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], {{#0xff|#0xf0}}
+  %1 = shl i8 %b, 4
+  %2 = and i8 %a, %1
+  ret i8 %2
+}
+
+define zeroext i16 @and_rs_i16(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: and_rs_i16
+; CHECK:       and [[REG:w[0-9]+]], w0, w1, lsl #8
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], {{#0xffff|#0xff00}}
+  %1 = shl i16 %b, 8
+  %2 = and i16 %a, %1
+  ret i16 %2
+}
+
+define i32 @and_rs_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: and_rs_i32
+; CHECK:       and w0, w0, w1, lsl #8
+  %1 = shl i32 %b, 8
+  %2 = and i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @and_rs_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: and_rs_i64
+; CHECK:       and x0, x0, x1, lsl #8
+  %1 = shl i64 %b, 8
+  %2 = and i64 %a, %1
+  ret i64 %2
+}
+
+define i32 @and_mul_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: and_mul_i32
+; CHECK:       and w0, w0, w1, lsl #2
+  %1 = mul i32 %b, 4
+  %2 = and i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @and_mul_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: and_mul_i64
+; CHECK:       and x0, x0, x1, lsl #2
+  %1 = mul i64 %b, 4
+  %2 = and i64 %a, %1
+  ret i64 %2
+}
+
+; OR
+define zeroext i1 @or_rr_i1(i1 signext %a, i1 signext %b) {
+; CHECK-LABEL: or_rr_i1
+; CHECK:       orr [[REG:w[0-9]+]], w0, w1
+  %1 = or i1 %a, %b
+  ret i1 %1
+}
+
+define zeroext i8 @or_rr_i8(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: or_rr_i8
+; CHECK:       orr [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xff
+  %1 = or i8 %a, %b
+  ret i8 %1
+}
+
+define zeroext i16 @or_rr_i16(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: or_rr_i16
+; CHECK:       orr [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xffff
+  %1 = or i16 %a, %b
+  ret i16 %1
+}
+
+define i32 @or_rr_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: or_rr_i32
+; CHECK:       orr w0, w0, w1
+  %1 = or i32 %a, %b
+  ret i32 %1
+}
+
+define i64 @or_rr_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: or_rr_i64
+; CHECK:       orr x0, x0, x1
+  %1 = or i64 %a, %b
+  ret i64 %1
+}
+
+define zeroext i8 @or_ri_i8(i8 %a) {
+; CHECK-LABEL: or_ri_i8
+; CHECK:       orr [[REG:w[0-9]+]], w0, #0xf
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xff
+  %1 = or i8 %a, 15
+  ret i8 %1
+}
+
+define zeroext i16 @or_ri_i16(i16 %a) {
+; CHECK-LABEL: or_ri_i16
+; CHECK:       orr [[REG:w[0-9]+]], w0, #0xff
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xffff
+  %1 = or i16 %a, 255
+  ret i16 %1
+}
+
+define i32 @or_ri_i32(i32 %a) {
+; CHECK-LABEL: or_ri_i32
+; CHECK:       orr w0, w0, #0xff
+  %1 = or i32 %a, 255
+  ret i32 %1
+}
+
+define i64 @or_ri_i64(i64 %a) {
+; CHECK-LABEL: or_ri_i64
+; CHECK:       orr x0, x0, #0xff
+  %1 = or i64 %a, 255
+  ret i64 %1
+}
+
+define zeroext i8 @or_rs_i8(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: or_rs_i8
+; CHECK:       orr [[REG:w[0-9]+]], w0, w1, lsl #4
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], {{#0xff|#0xf0}}
+  %1 = shl i8 %b, 4
+  %2 = or i8 %a, %1
+  ret i8 %2
+}
+
+define zeroext i16 @or_rs_i16(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: or_rs_i16
+; CHECK:       orr [[REG:w[0-9]+]], w0, w1, lsl #8
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], {{#0xffff|#0xff00}}
+  %1 = shl i16 %b, 8
+  %2 = or i16 %a, %1
+  ret i16 %2
+}
+
+define i32 @or_rs_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: or_rs_i32
+; CHECK:       orr w0, w0, w1, lsl #8
+  %1 = shl i32 %b, 8
+  %2 = or i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @or_rs_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: or_rs_i64
+; CHECK:       orr x0, x0, x1, lsl #8
+  %1 = shl i64 %b, 8
+  %2 = or i64 %a, %1
+  ret i64 %2
+}
+
+define i32 @or_mul_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: or_mul_i32
+; CHECK:       orr w0, w0, w1, lsl #2
+  %1 = mul i32 %b, 4
+  %2 = or i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @or_mul_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: or_mul_i64
+; CHECK:       orr x0, x0, x1, lsl #2
+  %1 = mul i64 %b, 4
+  %2 = or i64 %a, %1
+  ret i64 %2
+}
+
+; XOR
+define zeroext i1 @xor_rr_i1(i1 signext %a, i1 signext %b) {
+; CHECK-LABEL: xor_rr_i1
+; CHECK:       eor [[REG:w[0-9]+]], w0, w1
+  %1 = xor i1 %a, %b
+  ret i1 %1
+}
+
+define zeroext i8 @xor_rr_i8(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: xor_rr_i8
+; CHECK:       eor [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xff
+  %1 = xor i8 %a, %b
+  ret i8 %1
+}
+
+define zeroext i16 @xor_rr_i16(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: xor_rr_i16
+; CHECK:       eor [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xffff
+  %1 = xor i16 %a, %b
+  ret i16 %1
+}
+
+define i32 @xor_rr_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: xor_rr_i32
+; CHECK:       eor w0, w0, w1
+  %1 = xor i32 %a, %b
+  ret i32 %1
+}
+
+define i64 @xor_rr_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: xor_rr_i64
+; CHECK:       eor x0, x0, x1
+  %1 = xor i64 %a, %b
+  ret i64 %1
+}
+
+define zeroext i8 @xor_ri_i8(i8 signext %a) {
+; CHECK-LABEL: xor_ri_i8
+; CHECK:       eor [[REG:w[0-9]+]], w0, #0xf
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xff
+  %1 = xor i8 %a, 15
+  ret i8 %1
+}
+
+define zeroext i16 @xor_ri_i16(i16 signext %a) {
+; CHECK-LABEL: xor_ri_i16
+; CHECK:       eor [[REG:w[0-9]+]], w0, #0xff
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xffff
+  %1 = xor i16 %a, 255
+  ret i16 %1
+}
+
+define i32 @xor_ri_i32(i32 %a) {
+; CHECK-LABEL: xor_ri_i32
+; CHECK:       eor w0, w0, #0xff
+  %1 = xor i32 %a, 255
+  ret i32 %1
+}
+
+define i64 @xor_ri_i64(i64 %a) {
+; CHECK-LABEL: xor_ri_i64
+; CHECK:       eor x0, x0, #0xff
+  %1 = xor i64 %a, 255
+  ret i64 %1
+}
+
+define zeroext i8 @xor_rs_i8(i8 %a, i8 %b) {
+; CHECK-LABEL: xor_rs_i8
+; CHECK:       eor [[REG:w[0-9]+]], w0, w1, lsl #4
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], {{#0xff|#0xf0}}
+  %1 = shl i8 %b, 4
+  %2 = xor i8 %a, %1
+  ret i8 %2
+}
+
+define zeroext i16 @xor_rs_i16(i16 %a, i16 %b) {
+; CHECK-LABEL: xor_rs_i16
+; CHECK:       eor [[REG:w[0-9]+]], w0, w1, lsl #8
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], {{#0xffff|#0xff00}}
+  %1 = shl i16 %b, 8
+  %2 = xor i16 %a, %1
+  ret i16 %2
+}
+
+define i32 @xor_rs_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: xor_rs_i32
+; CHECK:       eor w0, w0, w1, lsl #8
+  %1 = shl i32 %b, 8
+  %2 = xor i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @xor_rs_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: xor_rs_i64
+; CHECK:       eor x0, x0, x1, lsl #8
+  %1 = shl i64 %b, 8
+  %2 = xor i64 %a, %1
+  ret i64 %2
+}
+
+define i32 @xor_mul_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: xor_mul_i32
+; CHECK:       eor w0, w0, w1, lsl #2
+  %1 = mul i32 %b, 4
+  %2 = xor i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @xor_mul_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: xor_mul_i64
+; CHECK:       eor x0, x0, x1, lsl #2
+  %1 = mul i64 %b, 4
+  %2 = xor i64 %a, %1
+  ret i64 %2
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-mul.ll b/test/CodeGen/AArch64/fast-isel-mul.ll
index d02c67f..f2fda27 100644
--- a/test/CodeGen/AArch64/fast-isel-mul.ll
+++ b/test/CodeGen/AArch64/fast-isel-mul.ll
@@ -1,40 +1,44 @@
-; RUN: llc -fast-isel -fast-isel-abort -mtriple=aarch64 -o - %s | FileCheck %s
+; RUN: llc -fast-isel -fast-isel-abort -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck %s
 
-@var8 = global i8 0
-@var16 = global i16 0
-@var32 = global i32 0
-@var64 = global i64 0
-
-define void @test_mul8(i8 %lhs, i8 %rhs) {
+define zeroext i8 @test_mul8(i8 %lhs, i8 %rhs) {
 ; CHECK-LABEL: test_mul8:
-; CHECK: mul w0, w0, w1
-;  %lhs = load i8* @var8
-;  %rhs = load i8* @var8
-  %prod = mul i8 %lhs, %rhs
-  store i8 %prod, i8* @var8
-  ret void
+; CHECK:       mul {{w[0-9]+}}, w0, w1
+  %1 = mul i8 %lhs, %rhs
+  ret i8 %1
 }
 
-define void @test_mul16(i16 %lhs, i16 %rhs) {
+define zeroext i16 @test_mul16(i16 %lhs, i16 %rhs) {
 ; CHECK-LABEL: test_mul16:
-; CHECK: mul w0, w0, w1
-  %prod = mul i16 %lhs, %rhs
-  store i16 %prod, i16* @var16
-  ret void
+; CHECK:       mul {{w[0-9]+}}, w0, w1
+  %1 = mul i16 %lhs, %rhs
+  ret i16 %1
 }
 
-define void @test_mul32(i32 %lhs, i32 %rhs) {
+define i32 @test_mul32(i32 %lhs, i32 %rhs) {
 ; CHECK-LABEL: test_mul32:
-; CHECK: mul w0, w0, w1
-  %prod = mul i32 %lhs, %rhs
-  store i32 %prod, i32* @var32
-  ret void
+; CHECK:       mul {{w[0-9]+}}, w0, w1
+  %1 = mul i32 %lhs, %rhs
+  ret i32 %1
 }
 
-define void @test_mul64(i64 %lhs, i64 %rhs) {
+define i64 @test_mul64(i64 %lhs, i64 %rhs) {
 ; CHECK-LABEL: test_mul64:
-; CHECK: mul x0, x0, x1
-  %prod = mul i64 %lhs, %rhs
-  store i64 %prod, i64* @var64
-  ret void
+; CHECK:       mul {{x[0-9]+}}, x0, x1
+  %1 = mul i64 %lhs, %rhs
+  ret i64 %1
+}
+
+define i32 @test_mul2shift_i32(i32 %a) {
+; CHECK-LABEL: test_mul2shift_i32:
+; CHECK:       lsl {{w[0-9]+}}, w0, #2
+  %1 = mul i32 %a, 4
+  ret i32 %1
 }
+
+define i64 @test_mul2shift_i64(i64 %a) {
+; CHECK-LABEL: test_mul2shift_i64:
+; CHECK:       lsl {{x[0-9]+}}, x0, #3
+  %1 = mul i64 %a, 8
+  ret i64 %1
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll b/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll
new file mode 100644
index 0000000..8d2d39a
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll
@@ -0,0 +1,96 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -code-model=small -verify-machineinstrs < %s | FileCheck %s --check-prefix=SMALL
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -code-model=large -verify-machineinstrs < %s | FileCheck %s --check-prefix=LARGE
+
+define float @frem_f32(float %a, float %b) {
+; SMALL-LABEL: frem_f32
+; SMALL:       bl _fmodf
+; LARGE-LABEL: frem_f32
+; LARGE:       adrp  [[REG:x[0-9]+]], _fmodf@GOTPAGE
+; LARGE:       ldr [[REG]], {{\[}}[[REG]], _fmodf@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr [[REG]]
+  %1 = frem float %a, %b
+  ret float %1
+}
+
+define double @frem_f64(double %a, double %b) {
+; SMALL-LABEL: frem_f64
+; SMALL:       bl _fmod
+; LARGE-LABEL: frem_f64
+; LARGE:       adrp  [[REG:x[0-9]+]], _fmod@GOTPAGE
+; LARGE:       ldr [[REG]], {{\[}}[[REG]], _fmod@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr [[REG]]
+  %1 = frem double %a, %b
+  ret double %1
+}
+
+define float @sin_f32(float %a) {
+; SMALL-LABEL: sin_f32
+; SMALL:       bl _sinf
+; LARGE-LABEL: sin_f32
+; LARGE:       adrp  [[REG:x[0-9]+]], _sinf@GOTPAGE
+; LARGE:       ldr [[REG]], {{\[}}[[REG]], _sinf@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr [[REG]]
+  %1 = call float @llvm.sin.f32(float %a)
+  ret float %1
+}
+
+define double @sin_f64(double %a) {
+; SMALL-LABEL: sin_f64
+; SMALL:       bl _sin
+; LARGE-LABEL: sin_f64
+; LARGE:       adrp  [[REG:x[0-9]+]], _sin@GOTPAGE
+; LARGE:       ldr [[REG]], {{\[}}[[REG]], _sin@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr [[REG]]
+  %1 = call double @llvm.sin.f64(double %a)
+  ret double %1
+}
+
+define float @cos_f32(float %a) {
+; SMALL-LABEL: cos_f32
+; SMALL:       bl _cosf
+; LARGE-LABEL: cos_f32
+; LARGE:       adrp  [[REG:x[0-9]+]], _cosf@GOTPAGE
+; LARGE:       ldr [[REG]], {{\[}}[[REG]], _cosf@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr [[REG]]
+  %1 = call float @llvm.cos.f32(float %a)
+  ret float %1
+}
+
+define double @cos_f64(double %a) {
+; SMALL-LABEL: cos_f64
+; SMALL:       bl _cos
+; LARGE-LABEL: cos_f64
+; LARGE:       adrp  [[REG:x[0-9]+]], _cos@GOTPAGE
+; LARGE:       ldr [[REG]], {{\[}}[[REG]], _cos@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr [[REG]]
+  %1 = call double @llvm.cos.f64(double %a)
+  ret double %1
+}
+
+define float @pow_f32(float %a, float %b) {
+; SMALL-LABEL: pow_f32
+; SMALL:       bl _powf
+; LARGE-LABEL: pow_f32
+; LARGE:       adrp  [[REG:x[0-9]+]], _powf@GOTPAGE
+; LARGE:       ldr [[REG]], {{\[}}[[REG]], _powf@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr [[REG]]
+  %1 = call float @llvm.pow.f32(float %a, float %b)
+  ret float %1
+}
+
+define double @pow_f64(double %a, double %b) {
+; SMALL-LABEL: pow_f64
+; SMALL:       bl _pow
+; LARGE-LABEL: pow_f64
+; LARGE:       adrp  [[REG:x[0-9]+]], _pow@GOTPAGE
+; LARGE:       ldr [[REG]], {{\[}}[[REG]], _pow@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr [[REG]]
+  %1 = call double @llvm.pow.f64(double %a, double %b)
+  ret double %1
+}
+declare float @llvm.sin.f32(float)
+declare double @llvm.sin.f64(double)
+declare float @llvm.cos.f32(float)
+declare double @llvm.cos.f64(double)
+declare float @llvm.pow.f32(float, float)
+declare double @llvm.pow.f64(double, double)
diff --git a/test/CodeGen/AArch64/fast-isel-sdiv.ll b/test/CodeGen/AArch64/fast-isel-sdiv.ll
new file mode 100644
index 0000000..3080776
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-sdiv.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=aarch64-apple-darwin                             -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+define i32 @sdiv_i32_exact(i32 %a) {
+; CHECK-LABEL: sdiv_i32_exact
+; CHECK:       asr {{w[0-9]+}}, w0, #3
+  %1 = sdiv exact i32 %a, 8
+  ret i32 %1
+}
+
+define i32 @sdiv_i32_pos(i32 %a) {
+; CHECK-LABEL: sdiv_i32_pos
+; CHECK:       add [[REG1:w[0-9]+]], w0, #7
+; CHECK-NEXT:  cmp w0, #0
+; CHECK-NEXT:  csel [[REG2:w[0-9]+]], [[REG1]], w0, lt
+; CHECK-NEXT:  asr {{w[0-9]+}}, [[REG2]], #3
+  %1 = sdiv i32 %a, 8
+  ret i32 %1
+}
+
+define i32 @sdiv_i32_neg(i32 %a) {
+; CHECK-LABEL: sdiv_i32_neg
+; CHECK:       add [[REG1:w[0-9]+]], w0, #7
+; CHECK-NEXT:  cmp w0, #0
+; CHECK-NEXT:  csel [[REG2:w[0-9]+]], [[REG1]], w0, lt
+; CHECK-NEXT:  neg {{w[0-9]+}}, [[REG2]], asr #3
+  %1 = sdiv i32 %a, -8
+  ret i32 %1
+}
+
+define i64 @sdiv_i64_exact(i64 %a) {
+; CHECK-LABEL: sdiv_i64_exact
+; CHECK:       asr {{x[0-9]+}}, x0, #4
+  %1 = sdiv exact i64 %a, 16
+  ret i64 %1
+}
+
+define i64 @sdiv_i64_pos(i64 %a) {
+; CHECK-LABEL: sdiv_i64_pos
+; CHECK:       add [[REG1:x[0-9]+]], x0, #15
+; CHECK-NEXT:  cmp x0, #0
+; CHECK-NEXT:  csel [[REG2:x[0-9]+]], [[REG1]], x0, lt
+; CHECK-NEXT:  asr {{x[0-9]+}}, [[REG2]], #4
+  %1 = sdiv i64 %a, 16
+  ret i64 %1
+}
+
+define i64 @sdiv_i64_neg(i64 %a) {
+; CHECK-LABEL: sdiv_i64_neg
+; CHECK:       add [[REG1:x[0-9]+]], x0, #15
+; CHECK-NEXT:  cmp x0, #0
+; CHECK-NEXT:  csel [[REG2:x[0-9]+]], [[REG1]], x0, lt
+; CHECK-NEXT:  neg {{x[0-9]+}}, [[REG2]], asr #4
+  %1 = sdiv i64 %a, -16
+  ret i64 %1
+}
diff --git a/test/CodeGen/AArch64/fast-isel-select.ll b/test/CodeGen/AArch64/fast-isel-select.ll
new file mode 100644
index 0000000..928e9d4
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-select.ll
@@ -0,0 +1,316 @@
+; RUN: llc -mtriple=aarch64-apple-darwin                             -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+; First test the different supported value types for select.
+define zeroext i1 @select_i1(i1 zeroext %c, i1 zeroext %a, i1 zeroext %b) {
+; CHECK-LABEL: select_i1
+; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT:  csel {{w[0-9]+}}, w1, w2, ne
+  %1 = select i1 %c, i1 %a, i1 %b
+  ret i1 %1
+}
+
+define zeroext i8 @select_i8(i1 zeroext %c, i8 zeroext %a, i8 zeroext %b) {
+; CHECK-LABEL: select_i8
+; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT:  csel {{w[0-9]+}}, w1, w2, ne
+  %1 = select i1 %c, i8 %a, i8 %b
+  ret i8 %1
+}
+
+define zeroext i16 @select_i16(i1 zeroext %c, i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: select_i16
+; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT:  csel {{w[0-9]+}}, w1, w2, ne
+  %1 = select i1 %c, i16 %a, i16 %b
+  ret i16 %1
+}
+
+define i32 @select_i32(i1 zeroext %c, i32 %a, i32 %b) {
+; CHECK-LABEL: select_i32
+; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT:  csel {{w[0-9]+}}, w1, w2, ne
+  %1 = select i1 %c, i32 %a, i32 %b
+  ret i32 %1
+}
+
+define i64 @select_i64(i1 zeroext %c, i64 %a, i64 %b) {
+; CHECK-LABEL: select_i64
+; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT:  csel {{x[0-9]+}}, x1, x2, ne
+  %1 = select i1 %c, i64 %a, i64 %b
+  ret i64 %1
+}
+
+define float @select_f32(i1 zeroext %c, float %a, float %b) {
+; CHECK-LABEL: select_f32
+; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, ne
+  %1 = select i1 %c, float %a, float %b
+  ret float %1
+}
+
+define double @select_f64(i1 zeroext %c, double %a, double %b) {
+; CHECK-LABEL: select_f64
+; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT:  fcsel {{d[0-9]+}}, d0, d1, ne
+  %1 = select i1 %c, double %a, double %b
+  ret double %1
+}
+
+; Now test the folding of all compares.
+define float @select_fcmp_false(float %x, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_false
+; CHECK:       mov.16b {{v[0-9]+}}, v2
+  %1 = fcmp ogt float %x, %x
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_ogt(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ogt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, gt
+  %1 = fcmp ogt float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_oge(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_oge
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, ge
+  %1 = fcmp oge float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_olt(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_olt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, mi
+  %1 = fcmp olt float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_ole(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ole
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, ls
+  %1 = fcmp ole float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_one(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_one
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel [[REG:s[0-9]+]], s2, s3, mi
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, [[REG]], gt
+  %1 = fcmp one float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_ord(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ord
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, vc
+  %1 = fcmp ord float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_uno(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_uno
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, vs
+  %1 = fcmp uno float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_ueq(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ueq
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel [[REG:s[0-9]+]], s2, s3, eq
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, [[REG]], vs
+  %1 = fcmp ueq float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_ugt(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ugt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, hi
+  %1 = fcmp ugt float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_uge(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_uge
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, pl
+  %1 = fcmp uge float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_ult(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ult
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, lt
+  %1 = fcmp ult float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+
+define float @select_fcmp_ule(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ule
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, le
+  %1 = fcmp ule float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_une(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_une
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, ne
+  %1 = fcmp une float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_true(float %x, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_true
+; CHECK:       mov.16b {{v[0-9]+}}, v1
+  %1 = fcmp ueq float %x, %x
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_eq(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_eq
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, eq
+  %1 = icmp eq i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_ne(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_ne
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, ne
+  %1 = icmp ne i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_ugt(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_ugt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, hi
+  %1 = icmp ugt i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_uge(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_uge
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, hs
+  %1 = icmp uge i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_ult(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_ult
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, lo
+  %1 = icmp ult i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_ule(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_ule
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, ls
+  %1 = icmp ule i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_sgt(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_sgt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, gt
+  %1 = icmp sgt i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_sge(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_sge
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, ge
+  %1 = icmp sge i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_slt(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_slt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, lt
+  %1 = icmp slt i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_sle(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_sle
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, le
+  %1 = icmp sle i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+; Test peephole optimizations for select.
+define zeroext i1 @select_opt1(i1 zeroext %c, i1 zeroext %a) {
+; CHECK-LABEL: select_opt1
+; CHECK:       orr {{w[0-9]+}}, w0, w1
+  %1 = select i1 %c, i1 true, i1 %a
+  ret i1 %1
+}
+
+define zeroext i1 @select_opt2(i1 zeroext %c, i1 zeroext %a) {
+; CHECK-LABEL: select_opt2
+; CHECK:       eor [[REG:w[0-9]+]], w0, #0x1
+; CHECK:       orr {{w[0-9]+}}, [[REG]], w1
+  %1 = select i1 %c, i1 %a, i1 true
+  ret i1 %1
+}
+
+define zeroext i1 @select_opt3(i1 zeroext %c, i1 zeroext %a) {
+; CHECK-LABEL: select_opt3
+; CHECK:       bic {{w[0-9]+}}, w1, w0
+  %1 = select i1 %c, i1 false, i1 %a
+  ret i1 %1
+}
+
+define zeroext i1 @select_opt4(i1 zeroext %c, i1 zeroext %a) {
+; CHECK-LABEL: select_opt4
+; CHECK:       and {{w[0-9]+}}, w0, w1
+  %1 = select i1 %c, i1 %a, i1 false
+  ret i1 %1
+}
diff --git a/test/CodeGen/AArch64/fast-isel-shift.ll b/test/CodeGen/AArch64/fast-isel-shift.ll
new file mode 100644
index 0000000..ce4ba49
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-shift.ll
@@ -0,0 +1,545 @@
+; RUN: llc -fast-isel -fast-isel-abort -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: asr_zext_i1_i16
+; CHECK:       uxth {{w[0-9]*}}, wzr
+define zeroext i16 @asr_zext_i1_i16(i1 %b) {
+  %1 = zext i1 %b to i16
+  %2 = ashr i16 %1, 1
+  ret i16 %2
+}
+
+; CHECK-LABEL: asr_sext_i1_i16
+; CHECK:       sbfx [[REG1:w[0-9]+]], {{w[0-9]*}}, #0, #1
+; CHECK-NEXT:  sxth {{w[0-9]*}}, [[REG1]]
+define signext i16 @asr_sext_i1_i16(i1 %b) {
+  %1 = sext i1 %b to i16
+  %2 = ashr i16 %1, 1
+  ret i16 %2
+}
+
+; CHECK-LABEL: asr_zext_i1_i32
+; CHECK:       mov {{w[0-9]*}}, wzr
+define i32 @asr_zext_i1_i32(i1 %b) {
+  %1 = zext i1 %b to i32
+  %2 = ashr i32 %1, 1
+  ret i32 %2
+}
+
+; CHECK-LABEL: asr_sext_i1_i32
+; CHECK:       sbfx  {{w[0-9]*}}, {{w[0-9]*}}, #0, #1
+define i32 @asr_sext_i1_i32(i1 %b) {
+  %1 = sext i1 %b to i32
+  %2 = ashr i32 %1, 1
+  ret i32 %2
+}
+
+; CHECK-LABEL: asr_zext_i1_i64
+; CHECK:       mov {{x[0-9]*}}, xzr
+define i64 @asr_zext_i1_i64(i1 %b) {
+  %1 = zext i1 %b to i64
+  %2 = ashr i64 %1, 1
+  ret i64 %2
+}
+
+; CHECK-LABEL: asr_sext_i1_i64
+; CHECK:       sbfx {{x[0-9]*}}, {{x[0-9]*}}, #0, #1
+define i64 @asr_sext_i1_i64(i1 %b) {
+  %1 = sext i1 %b to i64
+  %2 = ashr i64 %1, 1
+  ret i64 %2
+}
+
+; CHECK-LABEL: lsr_zext_i1_i16
+; CHECK:       uxth {{w[0-9]*}}, wzr
+define zeroext i16 @lsr_zext_i1_i16(i1 %b) {
+  %1 = zext i1 %b to i16
+  %2 = lshr i16 %1, 1
+  ret i16 %2
+}
+
+; CHECK-LABEL: lsr_sext_i1_i16
+; CHECK:       sbfx [[REG1:w[0-9]+]], {{w[0-9]*}}, #0, #1
+; CHECK-NEXT:  ubfx [[REG2:w[0-9]+]], [[REG1]], #1, #15
+; CHECK-NEXT:  sxth {{w[0-9]*}}, [[REG2]]
+define signext i16 @lsr_sext_i1_i16(i1 %b) {
+  %1 = sext i1 %b to i16
+  %2 = lshr i16 %1, 1
+  ret i16 %2
+}
+
+; CHECK-LABEL: lsr_zext_i1_i32
+; CHECK:       mov {{w[0-9]*}}, wzr
+define i32 @lsr_zext_i1_i32(i1 %b) {
+  %1 = zext i1 %b to i32
+  %2 = lshr i32 %1, 1
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsr_sext_i1_i32
+; CHECK:       sbfx [[REG1:w[0-9]+]], {{w[0-9]*}}, #0, #1
+; CHECK-NEXT:  lsr {{w[0-9]*}}, [[REG1:w[0-9]+]], #1
+define i32 @lsr_sext_i1_i32(i1 %b) {
+  %1 = sext i1 %b to i32
+  %2 = lshr i32 %1, 1
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsr_zext_i1_i64
+; CHECK:       mov {{x[0-9]*}}, xzr
+define i64 @lsr_zext_i1_i64(i1 %b) {
+  %1 = zext i1 %b to i64
+  %2 = lshr i64 %1, 1
+  ret i64 %2
+}
+
+; CHECK-LABEL: lsl_zext_i1_i16
+; CHECK:       ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #1
+define zeroext i16 @lsl_zext_i1_i16(i1 %b) {
+  %1 = zext i1 %b to i16
+  %2 = shl i16 %1, 4
+  ret i16 %2
+}
+
+; CHECK-LABEL: lsl_sext_i1_i16
+; CHECK:       sbfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #1
+define signext i16 @lsl_sext_i1_i16(i1 %b) {
+  %1 = sext i1 %b to i16
+  %2 = shl i16 %1, 4
+  ret i16 %2
+}
+
+; CHECK-LABEL: lsl_zext_i1_i32
+; CHECK:       ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #1
+define i32 @lsl_zext_i1_i32(i1 %b) {
+  %1 = zext i1 %b to i32
+  %2 = shl i32 %1, 4
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsl_sext_i1_i32
+; CHECK:       sbfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #1
+define i32 @lsl_sext_i1_i32(i1 %b) {
+  %1 = sext i1 %b to i32
+  %2 = shl i32 %1, 4
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsl_zext_i1_i64
+; CHECK:       ubfiz {{x[0-9]*}}, {{x[0-9]*}}, #4, #1
+define i64 @lsl_zext_i1_i64(i1 %b) {
+  %1 = zext i1 %b to i64
+  %2 = shl i64 %1, 4
+  ret i64 %2
+}
+
+; CHECK-LABEL: lsl_sext_i1_i64
+; CHECK:       sbfiz {{x[0-9]*}}, {{x[0-9]*}}, #4, #1
+define i64 @lsl_sext_i1_i64(i1 %b) {
+  %1 = sext i1 %b to i64
+  %2 = shl i64 %1, 4
+  ret i64 %2
+}
+
+; CHECK-LABEL: lslv_i8
+; CHECK:       and [[REG1:w[0-9]+]], w1, #0xff
+; CHECK-NEXT:  lsl [[REG2:w[0-9]+]], w0, [[REG1]]
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG2]], #0xff
+define zeroext i8 @lslv_i8(i8 %a, i8 %b) {
+  %1 = shl i8 %a, %b
+  ret i8 %1
+}
+
+; CHECK-LABEL: lsl_i8
+; CHECK:       ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define zeroext i8 @lsl_i8(i8 %a) {
+  %1 = shl i8 %a, 4
+  ret i8 %1
+}
+
+; CHECK-LABEL: lsl_zext_i8_i16
+; CHECK:       ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #8
+define zeroext i16 @lsl_zext_i8_i16(i8 %b) {
+  %1 = zext i8 %b to i16
+  %2 = shl i16 %1, 4
+  ret i16 %2
+}
+
+; CHECK-LABEL: lsl_sext_i8_i16
+; CHECK:       sbfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #8
+define signext i16 @lsl_sext_i8_i16(i8 %b) {
+  %1 = sext i8 %b to i16
+  %2 = shl i16 %1, 4
+  ret i16 %2
+}
+
+; CHECK-LABEL: lsl_zext_i8_i32
+; CHECK:       ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #8
+define i32 @lsl_zext_i8_i32(i8 %b) {
+  %1 = zext i8 %b to i32
+  %2 = shl i32 %1, 4
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsl_sext_i8_i32
+; CHECK:       sbfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #8
+define i32 @lsl_sext_i8_i32(i8 %b) {
+  %1 = sext i8 %b to i32
+  %2 = shl i32 %1, 4
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsl_zext_i8_i64
+; CHECK:       ubfiz {{x[0-9]*}}, {{x[0-9]*}}, #4, #8
+define i64 @lsl_zext_i8_i64(i8 %b) {
+  %1 = zext i8 %b to i64
+  %2 = shl i64 %1, 4
+  ret i64 %2
+}
+
+; CHECK-LABEL: lsl_sext_i8_i64
+; CHECK:       sbfiz {{x[0-9]*}}, {{x[0-9]*}}, #4, #8
+define i64 @lsl_sext_i8_i64(i8 %b) {
+  %1 = sext i8 %b to i64
+  %2 = shl i64 %1, 4
+  ret i64 %2
+}
+
+; CHECK-LABEL: lslv_i16
+; CHECK:       and [[REG1:w[0-9]+]], w1, #0xffff
+; CHECK-NEXT:  lsl [[REG2:w[0-9]+]], w0, [[REG1]]
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG2]], #0xffff
+define zeroext i16 @lslv_i16(i16 %a, i16 %b) {
+  %1 = shl i16 %a, %b
+  ret i16 %1
+}
+
+; CHECK-LABEL: lsl_i16
+; CHECK:       ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #8, #8
+define zeroext i16 @lsl_i16(i16 %a) {
+  %1 = shl i16 %a, 8
+  ret i16 %1
+}
+
+; CHECK-LABEL: lsl_zext_i16_i32
+; CHECK:       ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #8, #16
+define i32 @lsl_zext_i16_i32(i16 %b) {
+  %1 = zext i16 %b to i32
+  %2 = shl i32 %1, 8
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsl_sext_i16_i32
+; CHECK:       sbfiz {{w[0-9]*}}, {{w[0-9]*}}, #8, #16
+define i32 @lsl_sext_i16_i32(i16 %b) {
+  %1 = sext i16 %b to i32
+  %2 = shl i32 %1, 8
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsl_zext_i16_i64
+; CHECK:       ubfiz {{x[0-9]*}}, {{x[0-9]*}}, #8, #16
+define i64 @lsl_zext_i16_i64(i16 %b) {
+  %1 = zext i16 %b to i64
+  %2 = shl i64 %1, 8
+  ret i64 %2
+}
+
+; CHECK-LABEL: lsl_sext_i16_i64
+; CHECK:       sbfiz {{x[0-9]*}}, {{x[0-9]*}}, #8, #16
+define i64 @lsl_sext_i16_i64(i16 %b) {
+  %1 = sext i16 %b to i64
+  %2 = shl i64 %1, 8
+  ret i64 %2
+}
+
+; CHECK-LABEL: lslv_i32
+; CHECK:       lsl {{w[0-9]*}}, w0, w1
+define zeroext i32 @lslv_i32(i32 %a, i32 %b) {
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+; CHECK-LABEL: lsl_i32
+; CHECK:       lsl {{w[0-9]*}}, {{w[0-9]*}}, #16
+define zeroext i32 @lsl_i32(i32 %a) {
+  %1 = shl i32 %a, 16
+  ret i32 %1
+}
+
+; CHECK-LABEL: lsl_zext_i32_i64
+; CHECK:       ubfiz {{x[0-9]+}}, {{x[0-9]+}}, #16, #32
+define i64 @lsl_zext_i32_i64(i32 %b) {
+  %1 = zext i32 %b to i64
+  %2 = shl i64 %1, 16
+  ret i64 %2
+}
+
+; CHECK-LABEL: lsl_sext_i32_i64
+; CHECK:       sbfiz {{x[0-9]+}}, {{x[0-9]+}}, #16, #32
+define i64 @lsl_sext_i32_i64(i32 %b) {
+  %1 = sext i32 %b to i64
+  %2 = shl i64 %1, 16
+  ret i64 %2
+}
+
+; CHECK-LABEL: lslv_i64
+; CHECK:       lsl {{x[0-9]*}}, x0, x1
+define i64 @lslv_i64(i64 %a, i64 %b) {
+  %1 = shl i64 %a, %b
+  ret i64 %1
+}
+
+; CHECK-LABEL: lsl_i64
+; CHECK:       lsl {{x[0-9]*}}, {{x[0-9]*}}, #32
+define i64 @lsl_i64(i64 %a) {
+  %1 = shl i64 %a, 32
+  ret i64 %1
+}
+
+; CHECK-LABEL: lsrv_i8
+; CHECK:       and [[REG1:w[0-9]+]], w0, #0xff
+; CHECK-NEXT:  and [[REG2:w[0-9]+]], w1, #0xff
+; CHECK-NEXT:  lsr [[REG3:w[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG3]], #0xff
+define zeroext i8 @lsrv_i8(i8 %a, i8 %b) {
+  %1 = lshr i8 %a, %b
+  ret i8 %1
+}
+
+; CHECK-LABEL: lsr_i8
+; CHECK:       ubfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define zeroext i8 @lsr_i8(i8 %a) {
+  %1 = lshr i8 %a, 4
+  ret i8 %1
+}
+
+; CHECK-LABEL: lsr_zext_i8_i16
+; CHECK:       ubfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define zeroext i16 @lsr_zext_i8_i16(i8 %b) {
+  %1 = zext i8 %b to i16
+  %2 = lshr i16 %1, 4
+  ret i16 %2
+}
+
+; CHECK-LABEL: lsr_sext_i8_i16
+; CHECK:       sxtb [[REG:w[0-9]+]], w0
+; CHECK-NEXT:  ubfx {{w[0-9]*}}, [[REG]], #4, #12
+define signext i16 @lsr_sext_i8_i16(i8 %b) {
+  %1 = sext i8 %b to i16
+  %2 = lshr i16 %1, 4
+  ret i16 %2
+}
+
+; CHECK-LABEL: lsr_zext_i8_i32
+; CHECK:       ubfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define i32 @lsr_zext_i8_i32(i8 %b) {
+  %1 = zext i8 %b to i32
+  %2 = lshr i32 %1, 4
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsr_sext_i8_i32
+; CHECK:       sxtb [[REG:w[0-9]+]], w0
+; CHECK-NEXT:  lsr {{w[0-9]*}}, [[REG]], #4
+define i32 @lsr_sext_i8_i32(i8 %b) {
+  %1 = sext i8 %b to i32
+  %2 = lshr i32 %1, 4
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsrv_i16
+; CHECK:       and [[REG1:w[0-9]+]], w0, #0xffff
+; CHECK-NEXT:  and [[REG2:w[0-9]+]], w1, #0xffff
+; CHECK-NEXT:  lsr [[REG3:w[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG3]], #0xffff
+define zeroext i16 @lsrv_i16(i16 %a, i16 %b) {
+  %1 = lshr i16 %a, %b
+  ret i16 %1
+}
+
+; CHECK-LABEL: lsr_i16
+; CHECK:       ubfx {{w[0-9]*}}, {{w[0-9]*}}, #8, #8
+define zeroext i16 @lsr_i16(i16 %a) {
+  %1 = lshr i16 %a, 8
+  ret i16 %1
+}
+
+; CHECK-LABEL: lsrv_i32
+; CHECK:       lsr {{w[0-9]*}}, w0, w1
+define zeroext i32 @lsrv_i32(i32 %a, i32 %b) {
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+; CHECK-LABEL: lsr_i32
+; CHECK:       lsr {{w[0-9]*}}, {{w[0-9]*}}, #16
+define zeroext i32 @lsr_i32(i32 %a) {
+  %1 = lshr i32 %a, 16
+  ret i32 %1
+}
+
+; CHECK-LABEL: lsrv_i64
+; CHECK:       lsr {{x[0-9]*}}, x0, x1
+define i64 @lsrv_i64(i64 %a, i64 %b) {
+  %1 = lshr i64 %a, %b
+  ret i64 %1
+}
+
+; CHECK-LABEL: lsr_i64
+; CHECK:       lsr {{x[0-9]*}}, {{x[0-9]*}}, #32
+define i64 @lsr_i64(i64 %a) {
+  %1 = lshr i64 %a, 32
+  ret i64 %1
+}
+
+; CHECK-LABEL: asrv_i8
+; CHECK:       sxtb [[REG1:w[0-9]+]], w0
+; CHECK-NEXT:  and  [[REG2:w[0-9]+]], w1, #0xff
+; CHECK-NEXT:  asr  [[REG3:w[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-NEXT:  and  {{w[0-9]+}}, [[REG3]], #0xff
+define zeroext i8 @asrv_i8(i8 %a, i8 %b) {
+  %1 = ashr i8 %a, %b
+  ret i8 %1
+}
+
+; CHECK-LABEL: asr_i8
+; CHECK:       sbfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define zeroext i8 @asr_i8(i8 %a) {
+  %1 = ashr i8 %a, 4
+  ret i8 %1
+}
+
+; CHECK-LABEL: asr_zext_i8_i16
+; CHECK:       ubfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define zeroext i16 @asr_zext_i8_i16(i8 %b) {
+  %1 = zext i8 %b to i16
+  %2 = ashr i16 %1, 4
+  ret i16 %2
+}
+
+; CHECK-LABEL: asr_sext_i8_i16
+; CHECK:       sbfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define signext i16 @asr_sext_i8_i16(i8 %b) {
+  %1 = sext i8 %b to i16
+  %2 = ashr i16 %1, 4
+  ret i16 %2
+}
+
+; CHECK-LABEL: asr_zext_i8_i32
+; CHECK:       ubfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define i32 @asr_zext_i8_i32(i8 %b) {
+  %1 = zext i8 %b to i32
+  %2 = ashr i32 %1, 4
+  ret i32 %2
+}
+
+; CHECK-LABEL: asr_sext_i8_i32
+; CHECK:       sbfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define i32 @asr_sext_i8_i32(i8 %b) {
+  %1 = sext i8 %b to i32
+  %2 = ashr i32 %1, 4
+  ret i32 %2
+}
+
+; CHECK-LABEL: asrv_i16
+; CHECK:       sxth [[REG1:w[0-9]+]], w0
+; CHECK-NEXT:  and  [[REG2:w[0-9]+]], w1, #0xffff
+; CHECK-NEXT:  asr  [[REG3:w[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-NEXT:  and  {{w[0-9]+}}, [[REG3]], #0xffff
+define zeroext i16 @asrv_i16(i16 %a, i16 %b) {
+  %1 = ashr i16 %a, %b
+  ret i16 %1
+}
+
+; CHECK-LABEL: asr_i16
+; CHECK:       sbfx {{w[0-9]*}}, {{w[0-9]*}}, #8, #8
+define zeroext i16 @asr_i16(i16 %a) {
+  %1 = ashr i16 %a, 8
+  ret i16 %1
+}
+
+; CHECK-LABEL: asrv_i32
+; CHECK:       asr {{w[0-9]*}}, w0, w1
+define zeroext i32 @asrv_i32(i32 %a, i32 %b) {
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+; CHECK-LABEL: asr_i32
+; CHECK:       asr {{w[0-9]*}}, {{w[0-9]*}}, #16
+define zeroext i32 @asr_i32(i32 %a) {
+  %1 = ashr i32 %a, 16
+  ret i32 %1
+}
+
+; CHECK-LABEL: asrv_i64
+; CHECK:       asr {{x[0-9]*}}, x0, x1
+define i64 @asrv_i64(i64 %a, i64 %b) {
+  %1 = ashr i64 %a, %b
+  ret i64 %1
+}
+
+; CHECK-LABEL: asr_i64
+; CHECK:       asr {{x[0-9]*}}, {{x[0-9]*}}, #32
+define i64 @asr_i64(i64 %a) {
+  %1 = ashr i64 %a, 32
+  ret i64 %1
+}
+
+; CHECK-LABEL: shift_test1
+; CHECK:       ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+; CHECK-NEXT:  sbfx  {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define i32 @shift_test1(i8 %a) {
+  %1 = shl i8 %a, 4
+  %2 = ashr i8 %1, 4
+  %3 = sext i8 %2 to i32
+  ret i32 %3
+}
+
+; Test zero shifts
+
+; CHECK-LABEL: shl_zero
+; CHECK-NOT:   lsl
+define i32 @shl_zero(i32 %a) {
+  %1 = shl i32 %a, 0
+  ret i32 %1
+}
+
+; CHECK-LABEL: lshr_zero
+; CHECK-NOT:   lsr
+define i32 @lshr_zero(i32 %a) {
+  %1 = lshr i32 %a, 0
+  ret i32 %1
+}
+
+; CHECK-LABEL: ashr_zero
+; CHECK-NOT:   asr
+define i32 @ashr_zero(i32 %a) {
+  %1 = ashr i32 %a, 0
+  ret i32 %1
+}
+
+; CHECK-LABEL: shl_zext_zero
+; CHECK:       ubfx x0, x0, #0, #32
+define i64 @shl_zext_zero(i32 %a) {
+  %1 = zext i32 %a to i64
+  %2 = shl i64 %1, 0
+  ret i64 %2
+}
+
+; CHECK-LABEL: lshr_zext_zero
+; CHECK:       ubfx x0, x0, #0, #32
+define i64 @lshr_zext_zero(i32 %a) {
+  %1 = zext i32 %a to i64
+  %2 = lshr i64 %1, 0
+  ret i64 %2
+}
+
+; CHECK-LABEL: ashr_zext_zero
+; CHECK:       ubfx x0, x0, #0, #32
+define i64 @ashr_zext_zero(i32 %a) {
+  %1 = zext i32 %a to i64
+  %2 = ashr i64 %1, 0
+  ret i64 %2
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-sqrt.ll b/test/CodeGen/AArch64/fast-isel-sqrt.ll
new file mode 100644
index 0000000..1331d5c
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-sqrt.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=arm64-apple-darwin                             -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+define float @test_sqrt_f32(float %a) {
+; CHECK-LABEL: test_sqrt_f32
+; CHECK:       fsqrt s0, s0
+  %res = call float @llvm.sqrt.f32(float %a)
+  ret float %res
+}
+declare float @llvm.sqrt.f32(float) nounwind readnone
+
+define double @test_sqrt_f64(double %a) {
+; CHECK-LABEL: test_sqrt_f64
+; CHECK:       fsqrt d0, d0
+  %res = call double @llvm.sqrt.f64(double %a)
+  ret double %res
+}
+declare double @llvm.sqrt.f64(double) nounwind readnone
+
+
diff --git a/test/CodeGen/AArch64/fast-isel-switch-phi.ll b/test/CodeGen/AArch64/fast-isel-switch-phi.ll
new file mode 100644
index 0000000..c4f871c
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-switch-phi.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s
+
+; Test that the Machine Instruction PHI node doesn't have more than one operand
+; from the same predecessor.
+define i32 @foo(i32 %a, i32 %b, i1 %c) {
+entry:
+  br i1 %c, label %switch, label %direct
+
+switch:
+  switch i32 %a, label %exit [
+    i32 43, label %continue
+    i32 45, label %continue
+  ]
+
+direct:
+  %var = add i32 %b, 1
+  br label %continue
+
+continue:
+  %var.phi = phi i32 [ %var, %direct ], [ 0, %switch ], [ 0, %switch ]
+  ret i32 %var.phi
+
+exit:
+  ret i32 1
+}
diff --git a/test/CodeGen/AArch64/fast-isel-tbz.ll b/test/CodeGen/AArch64/fast-isel-tbz.ll
new file mode 100644
index 0000000..d7f46b2
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-tbz.ll
@@ -0,0 +1,141 @@
+; RUN: llc                             -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+; RUN: llc -fast-isel -fast-isel-abort -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+
+define i32 @icmp_eq_i8(i8 zeroext %a) {
+; CHECK-LABEL: icmp_eq_i8
+; CHECK:       tbz {{w[0-9]+}}, #0, {{LBB.+_2}}
+  %1 = and i8 %a, 1
+  %2 = icmp eq i8 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_eq_i16(i16 zeroext %a) {
+; CHECK-LABEL: icmp_eq_i16
+; CHECK:       tbz w0, #1, {{LBB.+_2}}
+  %1 = and i16 %a, 2
+  %2 = icmp eq i16 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_eq_i32(i32 %a) {
+; CHECK-LABEL: icmp_eq_i32
+; CHECK:       tbz w0, #2, {{LBB.+_2}}
+  %1 = and i32 %a, 4
+  %2 = icmp eq i32 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_eq_i64_1(i64 %a) {
+; CHECK-LABEL: icmp_eq_i64_1
+; CHECK:       tbz w0, #3, {{LBB.+_2}}
+  %1 = and i64 %a, 8
+  %2 = icmp eq i64 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_eq_i64_2(i64 %a) {
+; CHECK-LABEL: icmp_eq_i64_2
+; CHECK:       tbz x0, #32, {{LBB.+_2}}
+  %1 = and i64 %a, 4294967296
+  %2 = icmp eq i64 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_ne_i8(i8 zeroext %a) {
+; CHECK-LABEL: icmp_ne_i8
+; CHECK:       tbnz w0, #0, {{LBB.+_2}}
+  %1 = and i8 %a, 1
+  %2 = icmp ne i8 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_ne_i16(i16 zeroext %a) {
+; CHECK-LABEL: icmp_ne_i16
+; CHECK:       tbnz w0, #1, {{LBB.+_2}}
+  %1 = and i16 %a, 2
+  %2 = icmp ne i16 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_ne_i32(i32 %a) {
+; CHECK-LABEL: icmp_ne_i32
+; CHECK:       tbnz w0, #2, {{LBB.+_2}}
+  %1 = and i32 %a, 4
+  %2 = icmp ne i32 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_ne_i64_1(i64 %a) {
+; CHECK-LABEL: icmp_ne_i64_1
+; CHECK:       tbnz w0, #3, {{LBB.+_2}}
+  %1 = and i64 %a, 8
+  %2 = icmp ne i64 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_ne_i64_2(i64 %a) {
+; CHECK-LABEL: icmp_ne_i64_2
+; CHECK:       tbnz x0, #32, {{LBB.+_2}}
+  %1 = and i64 %a, 4294967296
+  %2 = icmp ne i64 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+; Test that we don't fold the 'and' instruction into the compare.
+define i32 @icmp_eq_and_i32(i32 %a, i1 %c) {
+; CHECK-LABEL: icmp_eq_and_i32
+; CHECK:       and  [[REG:w[0-9]+]], w0, #0x4
+; CHECK-NEXT:  cbz  [[REG]], {{LBB.+_3}}
+  %1 = and i32 %a, 4
+  br i1 %c, label %bb0, label %bb2
+bb0:
+  %2 = icmp eq i32 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 0, i32 2147483647}
+!1 = metadata !{metadata !"branch_weights", i32 2147483647, i32 0}
diff --git a/test/CodeGen/AArch64/fast-isel-trunc.ll b/test/CodeGen/AArch64/fast-isel-trunc.ll
new file mode 100644
index 0000000..55937eb
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-trunc.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s
+
+; Test that %1 doesn't get the kill flag set before its last use.
+define i32 @test_trunc(i32 %a) {
+  %1 = add i32 %a, 1
+  %2 = trunc i32 %1 to i16
+  %3 = icmp ult i16 1, %2
+  %4 = add i32 %1, 1
+  %5 = sext i1 %3 to i32
+  %6 = and i32 %4, %5
+  ret i32 %6
+}
diff --git a/test/CodeGen/AArch64/fast-isel-vector-arithmetic.ll b/test/CodeGen/AArch64/fast-isel-vector-arithmetic.ll
new file mode 100644
index 0000000..eaa0db5
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-vector-arithmetic.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=aarch64-apple-darwin                                                   -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -fast-isel-abort-args -verify-machineinstrs < %s | FileCheck %s
+
+; Vector Integer Add
+define <8 x i8> @add_v8i8_rr(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: add_v8i8_rr
+; CHECK: add.8b v0, v0, v1
+  %1 = add <8 x i8> %a, %b
+  ret <8 x i8> %1
+}
+
+define <16 x i8> @add_v16i8_rr(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: add_v16i8_rr
+; CHECK: add.16b v0, v0, v1
+  %1 = add <16 x i8> %a, %b
+  ret <16 x i8> %1
+}
+
+define <4 x i16> @add_v4i16_rr(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: add_v4i16_rr
+; CHECK: add.4h v0, v0, v1
+  %1 = add <4 x i16> %a, %b
+  ret <4 x i16> %1
+}
+
+define <8 x i16> @add_v8i16_rr(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: add_v8i16_rr
+; CHECK: add.8h v0, v0, v1
+  %1 = add <8 x i16> %a, %b
+  ret <8 x i16> %1
+}
+
+define <2 x i32> @add_v2i32_rr(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: add_v2i32_rr
+; CHECK: add.2s v0, v0, v1
+  %1 = add <2 x i32> %a, %b
+  ret <2 x i32> %1
+}
+
+define <4 x i32> @add_v4i32_rr(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: add_v4i32_rr
+; CHECK: add.4s v0, v0, v1
+  %1 = add <4 x i32> %a, %b
+  ret <4 x i32> %1
+}
+
+define <2 x i64> @add_v2i64_rr(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: add_v2i64_rr
+; CHECK: add.2d v0, v0, v1
+  %1 = add <2 x i64> %a, %b
+  ret <2 x i64> %1
+}
+
+; Vector Floating-point Add
+define <2 x float> @add_v2f32_rr(<2 x float> %a, <2 x float> %b) {
+; CHECK: add_v2f32_rr
+; CHECK: fadd.2s v0, v0, v1
+  %1 = fadd <2 x float> %a, %b
+  ret <2 x float> %1
+}
+
+define <4 x float> @add_v4f32_rr(<4 x float> %a, <4 x float> %b) {
+; CHECK: add_v4f32_rr
+; CHECK: fadd.4s v0, v0, v1
+  %1 = fadd <4 x float> %a, %b
+  ret <4 x float> %1
+}
+
+define <2 x double> @add_v2f64_rr(<2 x double> %a, <2 x double> %b) {
+; CHECK: add_v2f64_rr
+; CHECK: fadd.2d v0, v0, v1
+  %1 = fadd <2 x double> %a, %b
+  ret <2 x double> %1
+}
diff --git a/test/CodeGen/AArch64/fast-isel-vret.ll b/test/CodeGen/AArch64/fast-isel-vret.ll
new file mode 100644
index 0000000..9ad9227
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-vret.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+; Test that we don't abort fast-isle for ret
+define <8 x i8> @ret_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: ret_v8i8
+; CHECK:       add.8b v0, v0, v1
+  %1 = add <8 x i8> %a, %b
+  ret <8 x i8> %1
+}
diff --git a/test/CodeGen/AArch64/fp16-instructions.ll b/test/CodeGen/AArch64/fp16-instructions.ll
new file mode 100644
index 0000000..7a44cd1
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-instructions.ll
@@ -0,0 +1,109 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
+
+define half @add_h(half %a, half %b) {
+entry:
+; CHECK-LABEL: add_h:
+; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
+; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
+; CHECK: fadd [[RES:s[0-9]+]], [[OP1]], [[OP2]]
+; CHECK: fcvt h0, [[RES]]
+  %0 = fadd half %a, %b
+  ret half %0
+}
+
+
+define half @sub_h(half %a, half %b) {
+entry:
+; CHECK-LABEL: sub_h:
+; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
+; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
+; CHECK: fsub [[RES:s[0-9]+]], [[OP1]], [[OP2]]
+; CHECK: fcvt h0, [[RES]]
+  %0 = fsub half %a, %b
+  ret half %0
+}
+
+
+define half @mul_h(half %a, half %b) {
+entry:
+; CHECK-LABEL: mul_h:
+; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
+; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
+; CHECK: fmul [[RES:s[0-9]+]], [[OP1]], [[OP2]]
+; CHECK: fcvt h0, [[RES]]
+  %0 = fmul half %a, %b
+  ret half %0
+}
+
+
+define half @div_h(half %a, half %b) {
+entry:
+; CHECK-LABEL: div_h:
+; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
+; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
+; CHECK: fdiv [[RES:s[0-9]+]], [[OP1]], [[OP2]]
+; CHECK: fcvt h0, [[RES]]
+  %0 = fdiv half %a, %b
+  ret half %0
+}
+
+
+define half @load_h(half* %a) {
+entry:
+; CHECK-LABEL: load_h:
+; CHECK: ldr h0, [x0]
+  %0 = load half* %a, align 4
+  ret half %0
+}
+
+
+define void @store_h(half* %a, half %b) {
+entry:
+; CHECK-LABEL: store_h:
+; CHECK: str h0, [x0]
+  store half %b, half* %a, align 4
+  ret void
+}
+
+define half @s_to_h(float %a) {
+; CHECK-LABEL: s_to_h:
+; CHECK: fcvt h0, s0
+  %1 = fptrunc float %a to half
+  ret half %1
+}
+
+define half @d_to_h(double %a) {
+; CHECK-LABEL: d_to_h:
+; CHECK: fcvt h0, d0
+  %1 = fptrunc double %a to half
+  ret half %1
+}
+
+define float @h_to_s(half %a) {
+; CHECK-LABEL: h_to_s:
+; CHECK: fcvt s0, h0
+  %1 = fpext half %a to float
+  ret float %1
+}
+
+define double @h_to_d(half %a) {
+; CHECK-LABEL: h_to_d:
+; CHECK: fcvt d0, h0
+  %1 = fpext half %a to double
+  ret double %1
+}
+
+define half @bitcast_i_to_h(i16 %a) {
+; CHECK-LABEL: bitcast_i_to_h:
+; CHECK: fmov s0, w0
+  %1 = bitcast i16 %a to half
+  ret half %1
+}
+
+
+define i16 @bitcast_h_to_i(half %a) {
+; CHECK-LABEL: bitcast_h_to_i:
+; CHECK: fmov w0, s0
+  %1 = bitcast half %a to i16
+  ret i16 %1
+}
diff --git a/test/CodeGen/AArch64/fp16-v4-instructions.ll b/test/CodeGen/AArch64/fp16-v4-instructions.ll
new file mode 100644
index 0000000..8e89681
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-v4-instructions.ll
@@ -0,0 +1,122 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
+
+define <4 x half> @add_h(<4 x half> %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: add_h:
+; CHECK-DAG: fcvtl [[OP1:v[0-9]+\.4s]], v0.4h
+; CHECK-DAG: fcvtl [[OP2:v[0-9]+\.4s]], v1.4h
+; CHECK: fadd [[RES:v[0-9]+.4s]], [[OP1]], [[OP2]]
+; CHECK: fcvtn v0.4h, [[RES]]
+  %0 = fadd <4 x half> %a, %b
+  ret <4 x half> %0
+}
+
+
+define <4 x half> @sub_h(<4 x half> %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: sub_h:
+; CHECK-DAG: fcvtl [[OP1:v[0-9]+\.4s]], v0.4h
+; CHECK-DAG: fcvtl [[OP2:v[0-9]+\.4s]], v1.4h
+; CHECK: fsub [[RES:v[0-9]+.4s]], [[OP1]], [[OP2]]
+; CHECK: fcvtn v0.4h, [[RES]]
+  %0 = fsub <4 x half> %a, %b
+  ret <4 x half> %0
+}
+
+
+define <4 x half> @mul_h(<4 x half> %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: mul_h:
+; CHECK-DAG: fcvtl [[OP1:v[0-9]+\.4s]], v0.4h
+; CHECK-DAG: fcvtl [[OP2:v[0-9]+\.4s]], v1.4h
+; CHECK: fmul [[RES:v[0-9]+.4s]], [[OP1]], [[OP2]]
+; CHECK: fcvtn v0.4h, [[RES]]
+  %0 = fmul <4 x half> %a, %b
+  ret <4 x half> %0
+}
+
+
+define <4 x half> @div_h(<4 x half> %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: div_h:
+; CHECK-DAG: fcvtl [[OP1:v[0-9]+\.4s]], v0.4h
+; CHECK-DAG: fcvtl [[OP2:v[0-9]+\.4s]], v1.4h
+; CHECK: fdiv [[RES:v[0-9]+.4s]], [[OP1]], [[OP2]]
+; CHECK: fcvtn v0.4h, [[RES]]
+  %0 = fdiv <4 x half> %a, %b
+  ret <4 x half> %0
+}
+
+
+define <4 x half> @load_h(<4 x half>* %a) {
+entry:
+; CHECK-LABEL: load_h:
+; CHECK: ldr d0, [x0]
+  %0 = load <4 x half>* %a, align 4
+  ret <4 x half> %0
+}
+
+
+define void @store_h(<4 x half>* %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: store_h:
+; CHECK: str d0, [x0]
+  store <4 x half> %b, <4 x half>* %a, align 4
+  ret void
+}
+
+define <4 x half> @s_to_h(<4 x float> %a) {
+; CHECK-LABEL: s_to_h:
+; CHECK: fcvtn v0.4h, v0.4s
+  %1 = fptrunc <4 x float> %a to <4 x half>
+  ret <4 x half> %1
+}
+
+define <4 x half> @d_to_h(<4 x double> %a) {
+; CHECK-LABEL: d_to_h:
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+  %1 = fptrunc <4 x double> %a to <4 x half>
+  ret <4 x half> %1
+}
+
+define <4 x float> @h_to_s(<4 x half> %a) {
+; CHECK-LABEL: h_to_s:
+; CHECK: fcvtl v0.4s, v0.4h
+  %1 = fpext <4 x half> %a to <4 x float>
+  ret <4 x float> %1
+}
+
+define <4 x double> @h_to_d(<4 x half> %a) {
+; CHECK-LABEL: h_to_d:
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+  %1 = fpext <4 x half> %a to <4 x double>
+  ret <4 x double> %1
+}
+
+define <4 x half> @bitcast_i_to_h(float, <4 x i16> %a) {
+; CHECK-LABEL: bitcast_i_to_h:
+; CHECK: mov v0.16b, v1.16b
+  %2 = bitcast <4 x i16> %a to <4 x half>
+  ret <4 x half> %2
+}
+
+define <4 x i16> @bitcast_h_to_i(float, <4 x half> %a) {
+; CHECK-LABEL: bitcast_h_to_i:
+; CHECK: mov v0.16b, v1.16b
+  %2 = bitcast <4 x half> %a to <4 x i16>
+  ret <4 x i16> %2
+}
diff --git a/test/CodeGen/AArch64/fp16-v8-instructions.ll b/test/CodeGen/AArch64/fp16-v8-instructions.ll
new file mode 100644
index 0000000..9ee2296
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-v8-instructions.ll
@@ -0,0 +1,255 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
+
+define <8 x half> @add_h(<8 x half> %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: add_h:
+; CHECK: fcvt
+; CHECK: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK: fcvt
+  %0 = fadd <8 x half> %a, %b
+  ret <8 x half> %0
+}
+
+
+define <8 x half> @sub_h(<8 x half> %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: sub_h:
+; CHECK: fcvt
+; CHECK: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK: fcvt
+  %0 = fsub <8 x half> %a, %b
+  ret <8 x half> %0
+}
+
+
+define <8 x half> @mul_h(<8 x half> %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: mul_h:
+; CHECK: fcvt
+; CHECK: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK: fcvt
+  %0 = fmul <8 x half> %a, %b
+  ret <8 x half> %0
+}
+
+
+define <8 x half> @div_h(<8 x half> %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: div_h:
+; CHECK: fcvt
+; CHECK: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK: fcvt
+  %0 = fdiv <8 x half> %a, %b
+  ret <8 x half> %0
+}
+
+
+define <8 x half> @load_h(<8 x half>* %a) {
+entry:
+; CHECK-LABEL: load_h:
+; CHECK: ldr q0, [x0]
+  %0 = load <8 x half>* %a, align 4
+  ret <8 x half> %0
+}
+
+
+define void @store_h(<8 x half>* %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: store_h:
+; CHECK: str q0, [x0]
+  store <8 x half> %b, <8 x half>* %a, align 4
+  ret void
+}
+
+define <8 x half> @s_to_h(<8 x float> %a) {
+; CHECK-LABEL: s_to_h:
+; CHECK-DAG: fcvtn v0.4h, v0.4s
+; CHECK-DAG: fcvtn [[REG:v[0-9+]]].4h, v1.4s
+; CHECK: ins v0.d[1], [[REG]].d[0]
+  %1 = fptrunc <8 x float> %a to <8 x half>
+  ret <8 x half> %1
+}
+
+define <8 x half> @d_to_h(<8 x double> %a) {
+; CHECK-LABEL: d_to_h:
+; CHECK-DAG: ins v{{[0-9]+}}.d
+; CHECK-DAG: ins v{{[0-9]+}}.d
+; CHECK-DAG: ins v{{[0-9]+}}.d
+; CHECK-DAG: ins v{{[0-9]+}}.d
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+  %1 = fptrunc <8 x double> %a to <8 x half>
+  ret <8 x half> %1
+}
+
+define <8 x float> @h_to_s(<8 x half> %a) {
+; CHECK-LABEL: h_to_s:
+; CHECK: fcvtl2 v1.4s, v0.8h
+; CHECK: fcvtl v0.4s, v0.4h
+  %1 = fpext <8 x half> %a to <8 x float>
+  ret <8 x float> %1
+}
+
+define <8 x double> @h_to_d(<8 x half> %a) {
+; CHECK-LABEL: h_to_d:
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+  %1 = fpext <8 x half> %a to <8 x double>
+  ret <8 x double> %1
+}
+
+
+define <8 x half> @bitcast_i_to_h(float, <8 x i16> %a) {
+; CHECK-LABEL: bitcast_i_to_h:
+; CHECK: mov v0.16b, v1.16b
+  %2 = bitcast <8 x i16> %a to <8 x half>
+  ret <8 x half> %2
+}
+
+define <8 x i16> @bitcast_h_to_i(float, <8 x half> %a) {
+; CHECK-LABEL: bitcast_h_to_i:
+; CHECK: mov v0.16b, v1.16b
+  %2 = bitcast <8 x half> %a to <8 x i16>
+  ret <8 x i16> %2
+}
+
diff --git a/test/CodeGen/AArch64/fp16-vector-bitcast.ll b/test/CodeGen/AArch64/fp16-vector-bitcast.ll
new file mode 100644
index 0000000..421a4f5
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-vector-bitcast.ll
@@ -0,0 +1,203 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
+
+define <4 x i16> @v4f16_to_v4i16(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_v4i16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x half> %a to <4 x i16>
+  ret <4 x i16> %1
+}
+
+define <2 x i32> @v4f16_to_v2i32(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_v2i32:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x half> %a to <2 x i32>
+  ret <2 x i32> %1
+}
+
+define <1 x i64> @v4f16_to_v1i64(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_v1i64:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x half> %a to <1 x i64>
+  ret <1 x i64> %1
+}
+
+define i64 @v4f16_to_i64(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_i64:
+; CHECK: fmov x0, d1
+entry:
+  %1 = bitcast <4 x half> %a to i64
+  ret i64 %1
+}
+
+define <2 x float> @v4f16_to_v2float(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_v2float:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x half> %a to <2 x float>
+  ret <2 x float> %1
+}
+
+define <1 x double> @v4f16_to_v1double(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_v1double:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x half> %a to <1 x double>
+  ret <1 x double> %1
+}
+
+define double @v4f16_to_double(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_double:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x half> %a to double
+  ret double %1
+}
+
+
+define <4 x half> @v4i16_to_v4f16(float, <4 x i16> %a) #0 {
+; CHECK-LABEL: v4i16_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x i16> %a to <4 x half>
+  ret <4 x half> %1
+}
+
+define <4 x half> @v2i32_to_v4f16(float, <2 x i32> %a) #0 {
+; CHECK-LABEL: v2i32_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <2 x i32> %a to <4 x half>
+  ret <4 x half> %1
+}
+
+define <4 x half> @v1i64_to_v4f16(float, <1 x i64> %a) #0 {
+; CHECK-LABEL: v1i64_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <1 x i64> %a to <4 x half>
+  ret <4 x half> %1
+}
+
+define <4 x half> @i64_to_v4f16(float, i64 %a) #0 {
+; CHECK-LABEL: i64_to_v4f16:
+; CHECK: fmov d0, x0
+entry:
+  %1 = bitcast i64 %a to <4 x half>
+  ret <4 x half> %1
+}
+
+define <4 x half> @v2float_to_v4f16(float, <2 x float> %a) #0 {
+; CHECK-LABEL: v2float_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <2 x float> %a to <4 x half>
+  ret <4 x half> %1
+}
+
+define <4 x half> @v1double_to_v4f16(float, <1 x double> %a) #0 {
+; CHECK-LABEL: v1double_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <1 x double> %a to <4 x half>
+  ret <4 x half> %1
+}
+
+define <4 x half> @double_to_v4f16(float, double %a) #0 {
+; CHECK-LABEL: double_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast double %a to <4 x half>
+  ret <4 x half> %1
+}
+
+
+
+
+
+
+
+
+
+
+define <8 x i16> @v8f16_to_v8i16(float, <8 x half> %a) #0 {
+; CHECK-LABEL: v8f16_to_v8i16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <8 x half> %a to <8 x i16>
+  ret <8 x i16> %1
+}
+
+define <4 x i32> @v8f16_to_v4i32(float, <8 x half> %a) #0 {
+; CHECK-LABEL: v8f16_to_v4i32:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <8 x half> %a to <4 x i32>
+  ret <4 x i32> %1
+}
+
+define <2 x i64> @v8f16_to_v2i64(float, <8 x half> %a) #0 {
+; CHECK-LABEL: v8f16_to_v2i64:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <8 x half> %a to <2 x i64>
+  ret <2 x i64> %1
+}
+
+define <4 x float> @v8f16_to_v4float(float, <8 x half> %a) #0 {
+; CHECK-LABEL: v8f16_to_v4float:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <8 x half> %a to <4 x float>
+  ret <4 x float> %1
+}
+
+define <2 x double> @v8f16_to_v2double(float, <8 x half> %a) #0 {
+; CHECK-LABEL: v8f16_to_v2double:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <8 x half> %a to <2 x double>
+  ret <2 x double> %1
+}
+
+define <8 x half> @v8i16_to_v8f16(float, <8 x i16> %a) #0 {
+; CHECK-LABEL: v8i16_to_v8f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <8 x i16> %a to <8 x half>
+  ret <8 x half> %1
+}
+
+define <8 x half> @v4i32_to_v8f16(float, <4 x i32> %a) #0 {
+; CHECK-LABEL: v4i32_to_v8f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x i32> %a to <8 x half>
+  ret <8 x half> %1
+}
+
+define <8 x half> @v2i64_to_v8f16(float, <2 x i64> %a) #0 {
+; CHECK-LABEL: v2i64_to_v8f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <2 x i64> %a to <8 x half>
+  ret <8 x half> %1
+}
+
+define <8 x half> @v4float_to_v8f16(float, <4 x float> %a) #0 {
+; CHECK-LABEL: v4float_to_v8f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x float> %a to <8 x half>
+  ret <8 x half> %1
+}
+
+define <8 x half> @v2double_to_v8f16(float, <2 x double> %a) #0 {
+; CHECK-LABEL: v2double_to_v8f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <2 x double> %a to <8 x half>
+  ret <8 x half> %1
+}
diff --git a/test/CodeGen/AArch64/fp16-vector-load-store.ll b/test/CodeGen/AArch64/fp16-vector-load-store.ll
new file mode 100644
index 0000000..edbbffe
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-vector-load-store.ll
@@ -0,0 +1,528 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
+
+; Simple load of v4i16
+define <4 x half> @load_64(<4 x half>* nocapture readonly %a) #0 {
+; CHECK-LABEL: load_64:
+; CHECK: ldr d0, [x0]
+entry:
+  %0 = load <4 x half>* %a, align 8
+  ret <4 x half> %0
+}
+
+; Simple load of v8i16
+define <8 x half> @load_128(<8 x half>* nocapture readonly %a) #0 {
+; CHECK-LABEL: load_128:
+; CHECK: ldr q0, [x0]
+entry:
+  %0 = load <8 x half>* %a, align 16
+  ret <8 x half> %0
+}
+
+; Duplicating load to v4i16
+define <4 x half> @load_dup_64(half* nocapture readonly %a) #0 {
+; CHECK-LABEL: load_dup_64:
+; CHECK: ld1r { v0.4h }, [x0]
+entry:
+  %0 = load half* %a, align 2
+  %1 = insertelement <4 x half> undef, half %0, i32 0
+  %2 = shufflevector <4 x half> %1, <4 x half> undef, <4 x i32> zeroinitializer
+  ret <4 x half> %2
+}
+
+; Duplicating load to v8i16
+define <8 x half> @load_dup_128(half* nocapture readonly %a) #0 {
+; CHECK-LABEL: load_dup_128:
+; CHECK: ld1r { v0.8h }, [x0]
+entry:
+  %0 = load half* %a, align 2
+  %1 = insertelement <8 x half> undef, half %0, i32 0
+  %2 = shufflevector <8 x half> %1, <8 x half> undef, <8 x i32> zeroinitializer
+  ret <8 x half> %2
+}
+
+; Load to one lane of v4f16
+define <4 x half> @load_lane_64(half* nocapture readonly %a, <4 x half> %b) #0 {
+; CHECK-LABEL: load_lane_64:
+; CHECK: ld1 { v0.h }[2], [x0]
+entry:
+  %0 = load half* %a, align 2
+  %1 = insertelement <4 x half> %b, half %0, i32 2
+  ret <4 x half> %1
+}
+
+; Load to one lane of v8f16
+define <8 x half> @load_lane_128(half* nocapture readonly %a, <8 x half> %b) #0 {
+; CHECK-LABEL: load_lane_128:
+; CHECK: ld1 { v0.h }[5], [x0]
+entry:
+  %0 = load half* %a, align 2
+  %1 = insertelement <8 x half> %b, half %0, i32 5
+  ret <8 x half> %1
+}
+
+; Simple store of v4f16
+define void @store_64(<4 x half>* nocapture %a, <4 x half> %b) #1 {
+; CHECK-LABEL: store_64:
+; CHECK: str d0, [x0]
+entry:
+  store <4 x half> %b, <4 x half>* %a, align 8
+  ret void
+}
+
+; Simple store of v8f16
+define void @store_128(<8 x half>* nocapture %a, <8 x half> %b) #1 {
+; CHECK-LABEL: store_128:
+; CHECK: str q0, [x0]
+entry:
+  store <8 x half> %b, <8 x half>* %a, align 16
+  ret void
+}
+
+; Store from one lane of v4f16
+define void @store_lane_64(half* nocapture %a, <4 x half> %b) #1 {
+; CHECK-LABEL: store_lane_64:
+; CHECK: st1 { v0.h }[2], [x0]
+entry:
+  %0 = extractelement <4 x half> %b, i32 2
+  store half %0, half* %a, align 2
+  ret void
+}
+
+; Store from one lane of v8f16
+define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 {
+; CHECK-LABEL: store_lane_128:
+; CHECK: st1 { v0.h }[5], [x0]
+entry:
+  %0 = extractelement <8 x half> %b, i32 5
+  store half %0, half* %a, align 2
+  ret void
+}
+
+; NEON intrinsics - (de-)interleaving loads and stores
+declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>*)
+declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>*)
+declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>*)
+declare void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
+declare void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
+declare void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
+declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>*)
+declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>*)
+declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>*)
+declare void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
+declare void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
+declare void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
+
+; Load 2 x v4f16 with de-interleaving
+define { <4 x half>, <4 x half> } @load_interleave_64_2(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_64_2:
+; CHECK: ld2 { v0.4h, v1.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>* %a)
+  ret { <4 x half>, <4 x half> } %0
+}
+
+; Load 3 x v4f16 with de-interleaving
+define { <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_3(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_64_3:
+; CHECK: ld3 { v0.4h, v1.4h, v2.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>* %a)
+  ret { <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Load 4 x v4f16 with de-interleaving
+define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_4(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_64_4:
+; CHECK: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>* %a)
+  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Store 2 x v4f16 with interleaving
+define void @store_interleave_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: store_interleave_64_2:
+; CHECK: st2 { v0.4h, v1.4h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
+  ret void
+}
+
+; Store 3 x v4f16 with interleaving
+define void @store_interleave_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
+; CHECK-LABEL: store_interleave_64_3:
+; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
+  ret void
+}
+
+; Store 4 x v4f16 with interleaving
+define void @store_interleave_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
+; CHECK-LABEL: store_interleave_64_4:
+; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
+  ret void
+}
+
+; Load 2 x v8f16 with de-interleaving
+define { <8 x half>, <8 x half> } @load_interleave_128_2(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_128_2:
+; CHECK: ld2 { v0.8h, v1.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>* %a)
+  ret { <8 x half>, <8 x half> } %0
+}
+
+; Load 3 x v8f16 with de-interleaving
+define { <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_3(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_128_3:
+; CHECK: ld3 { v0.8h, v1.8h, v2.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>* %a)
+  ret { <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Load 8 x v8f16 with de-interleaving
+define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_4(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_128_4:
+; CHECK: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>* %a)
+  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Store 2 x v8f16 with interleaving
+define void @store_interleave_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: store_interleave_128_2:
+; CHECK: st2 { v0.8h, v1.8h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
+  ret void
+}
+
+; Store 3 x v8f16 with interleaving
+define void @store_interleave_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
+; CHECK-LABEL: store_interleave_128_3:
+; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
+  ret void
+}
+
+; Store 8 x v8f16 with interleaving
+define void @store_interleave_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
+; CHECK-LABEL: store_interleave_128_4:
+; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
+  ret void
+}
+
+; NEON intrinsics - duplicating loads
+declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half*)
+declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half*)
+declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half*)
+declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half*)
+declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half*)
+declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half*)
+
+; Load 2 x v4f16 with duplication
+define { <4 x half>, <4 x half> } @load_dup_64_2(half* %a) #0 {
+; CHECK-LABEL: load_dup_64_2:
+; CHECK: ld2r { v0.4h, v1.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* %a)
+  ret { <4 x half>, <4 x half> } %0
+}
+
+; Load 3 x v4f16 with duplication
+define { <4 x half>, <4 x half>, <4 x half> } @load_dup_64_3(half* %a) #0 {
+; CHECK-LABEL: load_dup_64_3:
+; CHECK: ld3r { v0.4h, v1.4h, v2.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* %a)
+  ret { <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Load 4 x v4f16 with duplication
+define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_dup_64_4(half* %a) #0 {
+; CHECK-LABEL: load_dup_64_4:
+; CHECK: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* %a)
+  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Load 2 x v8f16 with duplication
+define { <8 x half>, <8 x half> } @load_dup_128_2(half* %a) #0 {
+; CHECK-LABEL: load_dup_128_2:
+; CHECK: ld2r { v0.8h, v1.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* %a)
+  ret { <8 x half>, <8 x half> } %0
+}
+
+; Load 3 x v8f16 with duplication
+define { <8 x half>, <8 x half>, <8 x half> } @load_dup_128_3(half* %a) #0 {
+; CHECK-LABEL: load_dup_128_3:
+; CHECK: ld3r { v0.8h, v1.8h, v2.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* %a)
+  ret { <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Load 8 x v8f16 with duplication
+define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_dup_128_4(half* %a) #0 {
+; CHECK-LABEL: load_dup_128_4:
+; CHECK: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* %a)
+  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+
+; NEON intrinsics - loads and stores to/from one lane
+declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
+declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
+declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
+declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
+declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
+declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
+
+; Load one lane of 2 x v4f16
+define { <4 x half>, <4 x half> } @load_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: load_lane_64_2:
+; CHECK: ld2 { v0.h, v1.h }[2], [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
+  ret { <4 x half>, <4 x half> } %0
+}
+
+; Load one lane of 3 x v4f16
+define { <4 x half>, <4 x half>, <4 x half> } @load_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
+; CHECK-LABEL: load_lane_64_3:
+; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
+  ret { <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Load one lane of 4 x v4f16
+define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
+; CHECK-LABEL: load_lane_64_4:
+; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
+  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Store one lane of 2 x v4f16
+define void @store_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: store_lane_64_2:
+; CHECK: st2 { v0.h, v1.h }[2], [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
+  ret void
+}
+
+; Store one lane of 3 x v4f16
+define void @store_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
+; CHECK-LABEL: store_lane_64_3:
+; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
+  ret void
+}
+
+; Store one lane of 4 x v4f16
+define void @store_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
+; CHECK-LABEL: store_lane_64_4:
+; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
+  ret void
+}
+
+; Load one lane of 2 x v8f16
+define { <8 x half>, <8 x half> } @load_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: load_lane_128_2:
+; CHECK: ld2 { v0.h, v1.h }[2], [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
+  ret { <8 x half>, <8 x half> } %0
+}
+
+; Load one lane of 3 x v8f16
+define { <8 x half>, <8 x half>, <8 x half> } @load_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
+; CHECK-LABEL: load_lane_128_3:
+; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
+  ret { <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Load one lane of 8 x v8f16
+define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
+; CHECK-LABEL: load_lane_128_4:
+; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
+  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Store one lane of 2 x v8f16
+define void @store_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: store_lane_128_2:
+; CHECK: st2 { v0.h, v1.h }[2], [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
+  ret void
+}
+
+; Store one lane of 3 x v8f16
+define void @store_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
+; CHECK-LABEL: store_lane_128_3:
+; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
+  ret void
+}
+
+; Store one lane of 8 x v8f16
+define void @store_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
+; CHECK-LABEL: store_lane_128_4:
+; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
+  ret void
+}
+
+; NEON intrinsics - load/store without interleaving
+declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>*)
+declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>*)
+declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>*)
+declare void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
+declare void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
+declare void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
+declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>*)
+declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>*)
+declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>*)
+declare void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
+declare void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
+declare void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
+
+; Load 2 x v4f16 without de-interleaving
+define { <4 x half>, <4 x half> } @load_64_2(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_64_2:
+; CHECK: ld1 { v0.4h, v1.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>* %a)
+  ret { <4 x half>, <4 x half> } %0
+}
+
+; Load 3 x v4f16 without de-interleaving
+define { <4 x half>, <4 x half>, <4 x half> } @load_64_3(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_64_3:
+; CHECK: ld1 { v0.4h, v1.4h, v2.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>* %a)
+  ret { <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Load 4 x v4f16 without de-interleaving
+define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_64_4(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_64_4:
+; CHECK: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>* %a)
+  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Store 2 x v4f16 without interleaving
+define void @store_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: store_64_2:
+; CHECK: st1 { v0.4h, v1.4h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
+  ret void
+}
+
+; Store 3 x v4f16 without interleaving
+define void @store_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
+; CHECK-LABEL: store_64_3:
+; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
+  ret void
+}
+
+; Store 4 x v4f16 without interleaving
+define void @store_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
+; CHECK-LABEL: store_64_4:
+; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
+  ret void
+}
+
+; Load 2 x v8f16 without de-interleaving
+define { <8 x half>, <8 x half> } @load_128_2(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_128_2:
+; CHECK: ld1 { v0.8h, v1.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>* %a)
+  ret { <8 x half>, <8 x half> } %0
+}
+
+; Load 3 x v8f16 without de-interleaving
+define { <8 x half>, <8 x half>, <8 x half> } @load_128_3(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_128_3:
+; CHECK: ld1 { v0.8h, v1.8h, v2.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>* %a)
+  ret { <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Load 8 x v8f16 without de-interleaving
+define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_128_4(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_128_4:
+; CHECK: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>* %a)
+  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Store 2 x v8f16 without interleaving
+define void @store_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: store_128_2:
+; CHECK: st1 { v0.8h, v1.8h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
+  ret void
+}
+
+; Store 3 x v8f16 without interleaving
+define void @store_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
+; CHECK-LABEL: store_128_3:
+; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
+  ret void
+}
+
+; Store 8 x v8f16 without interleaving
+define void @store_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
+; CHECK-LABEL: store_128_4:
+; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
+  ret void
+}
diff --git a/test/CodeGen/AArch64/fp16-vector-shuffle.ll b/test/CodeGen/AArch64/fp16-vector-shuffle.ll
new file mode 100644
index 0000000..74d1b43
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-vector-shuffle.ll
@@ -0,0 +1,301 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
+
+; float16x4_t select_64(float16x4_t a, float16x4_t b, uint16x4_t c) { return vbsl_u16(c, a, b); }
+define <4 x half> @select_64(<4 x half> %a, <4 x half> %b, <4 x i16> %c) #0 {
+; CHECK-LABEL: select_64:
+; CHECK: bsl
+entry:
+  %0 = bitcast <4 x half> %a to <4 x i16>
+  %1 = bitcast <4 x half> %b to <4 x i16>
+  %vbsl3.i = and <4 x i16> %0, %c
+  %2 = xor <4 x i16> %c, <i16 -1, i16 -1, i16 -1, i16 -1>
+  %vbsl4.i = and <4 x i16> %1, %2
+  %vbsl5.i = or <4 x i16> %vbsl3.i, %vbsl4.i
+  %3 = bitcast <4 x i16> %vbsl5.i to <4 x half>
+  ret <4 x half> %3
+}
+
+; float16x8_t select_128(float16x8_t a, float16x8_t b, uint16x8_t c) { return vbslq_u16(c, a, b); }
+define <8 x half> @select_128(<8 x half> %a, <8 x half> %b, <8 x i16> %c) #0 {
+; CHECK-LABEL: select_128:
+; CHECK: bsl
+entry:
+  %0 = bitcast <8 x half> %a to <8 x i16>
+  %1 = bitcast <8 x half> %b to <8 x i16>
+  %vbsl3.i = and <8 x i16> %0, %c
+  %2 = xor <8 x i16> %c, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %vbsl4.i = and <8 x i16> %1, %2
+  %vbsl5.i = or <8 x i16> %vbsl3.i, %vbsl4.i
+  %3 = bitcast <8 x i16> %vbsl5.i to <8 x half>
+  ret <8 x half> %3
+}
+
+; float16x4_t lane_64_64(float16x4_t a, float16x4_t b) {
+;  return vcopy_lane_s16(a, 1, b, 2);
+; }
+define <4 x half> @lane_64_64(<4 x half> %a, <4 x half> %b) #0 {
+; CHECK-LABEL: lane_64_64:
+; CHECK: ins
+entry:
+  %0 = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x half> %0
+}
+
+; float16x8_t lane_128_64(float16x8_t a, float16x4_t b) {
+;   return vcopyq_lane_s16(a, 1, b, 2);
+; }
+define <8 x half> @lane_128_64(<8 x half> %a, <4 x half> %b) #0 {
+; CHECK-LABEL: lane_128_64:
+; CHECK: ins
+entry:
+  %0 = bitcast <4 x half> %b to <4 x i16>
+  %vget_lane = extractelement <4 x i16> %0, i32 2
+  %1 = bitcast <8 x half> %a to <8 x i16>
+  %vset_lane = insertelement <8 x i16> %1, i16 %vget_lane, i32 1
+  %2 = bitcast <8 x i16> %vset_lane to <8 x half>
+  ret <8 x half> %2
+}
+
+; float16x4_t lane_64_128(float16x4_t a, float16x8_t b) {
+;   return vcopy_laneq_s16(a, 3, b, 5);
+; }
+define <4 x half> @lane_64_128(<4 x half> %a, <8 x half> %b) #0 {
+; CHECK-LABEL: lane_64_128:
+; CHECK: ins
+entry:
+  %0 = bitcast <8 x half> %b to <8 x i16>
+  %vgetq_lane = extractelement <8 x i16> %0, i32 5
+  %1 = bitcast <4 x half> %a to <4 x i16>
+  %vset_lane = insertelement <4 x i16> %1, i16 %vgetq_lane, i32 3
+  %2 = bitcast <4 x i16> %vset_lane to <4 x half>
+  ret <4 x half> %2
+}
+
+; float16x8_t lane_128_128(float16x8_t a, float16x8_t b) {
+;   return vcopyq_laneq_s16(a, 3, b, 5);
+; }
+define <8 x half> @lane_128_128(<8 x half> %a, <8 x half> %b) #0 {
+; CHECK-LABEL: lane_128_128:
+; CHECK: ins
+entry:
+  %0 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x half> %0
+}
+
+; float16x4_t ext_64(float16x4_t a, float16x4_t b) {
+;   return vext_s16(a, b, 3);
+; }
+define <4 x half> @ext_64(<4 x half> %a, <4 x half> %b) #0 {
+; CHECK-LABEL: ext_64:
+; CHECK: ext
+entry:
+  %0 = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x half> %0
+}
+
+; float16x8_t ext_128(float16x8_t a, float16x8_t b) {
+;   return vextq_s16(a, b, 3);
+; }
+define <8 x half> @ext_128(<8 x half> %a, <8 x half> %b) #0 {
+; CHECK-LABEL: ext_128:
+; CHECK: ext
+entry:
+  %0 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+  ret <8 x half> %0
+}
+
+; float16x4_t rev32_64(float16x4_t a) {
+;   return vrev32_s16(a);
+; }
+define <4 x half> @rev32_64(<4 x half> %a) #0 {
+entry:
+; CHECK-LABEL: rev32_64:
+; CHECK: rev32
+  %0 = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x half> %0
+}
+
+; float16x4_t rev64_64(float16x4_t a) {
+;   return vrev64_s16(a);
+; }
+define <4 x half> @rev64_64(<4 x half> %a) #0 {
+entry:
+; CHECK-LABEL: rev64_64:
+; CHECK: rev64
+  %0 = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x half> %0
+}
+
+; float16x8_t rev32_128(float16x8_t a) {
+;   return vrev32q_s16(a);
+; }
+define <8 x half> @rev32_128(<8 x half> %a) #0 {
+entry:
+; CHECK-LABEL: rev32_128:
+; CHECK: rev32
+  %0 = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x half> %0
+}
+
+; float16x8_t rev64_128(float16x8_t a) {
+;   return vrev64q_s16(a);
+; }
+define <8 x half> @rev64_128(<8 x half> %a) #0 {
+entry:
+; CHECK-LABEL: rev64_128:
+; CHECK: rev64
+  %0 = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x half> %0
+}
+
+; float16x4_t create_64(long long a) { return vcreate_f16(a); }
+define <4 x half> @create_64(i64 %a) #0 {
+; CHECK-LABEL: create_64:
+; CHECK: fmov
+entry:
+  %0 = bitcast i64 %a to <4 x half>
+  ret <4 x half> %0
+}
+
+; float16x4_t dup_64(__fp16 a) { return vdup_n_f16(a); }
+define <4 x half> @dup_64(half %a) #0 {
+; CHECK-LABEL: dup_64:
+; CHECK: dup
+entry:
+  %vecinit = insertelement <4 x half> undef, half %a, i32 0
+  %vecinit1 = insertelement <4 x half> %vecinit, half %a, i32 1
+  %vecinit2 = insertelement <4 x half> %vecinit1, half %a, i32 2
+  %vecinit3 = insertelement <4 x half> %vecinit2, half %a, i32 3
+  ret <4 x half> %vecinit3
+}
+
+; float16x8_t dup_128(__fp16 a) { return vdupq_n_f16(a); }
+define <8 x half> @dup_128(half %a) #0 {
+entry:
+; CHECK-LABEL: dup_128:
+; CHECK: dup
+  %vecinit = insertelement <8 x half> undef, half %a, i32 0
+  %vecinit1 = insertelement <8 x half> %vecinit, half %a, i32 1
+  %vecinit2 = insertelement <8 x half> %vecinit1, half %a, i32 2
+  %vecinit3 = insertelement <8 x half> %vecinit2, half %a, i32 3
+  %vecinit4 = insertelement <8 x half> %vecinit3, half %a, i32 4
+  %vecinit5 = insertelement <8 x half> %vecinit4, half %a, i32 5
+  %vecinit6 = insertelement <8 x half> %vecinit5, half %a, i32 6
+  %vecinit7 = insertelement <8 x half> %vecinit6, half %a, i32 7
+  ret <8 x half> %vecinit7
+}
+
+; float16x4_t dup_lane_64(float16x4_t a) { return vdup_lane_f16(a, 2); }
+define <4 x half> @dup_lane_64(<4 x half> %a) #0 {
+entry:
+; CHECK-LABEL: dup_lane_64:
+; CHECK: dup
+  %shuffle = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x half> %shuffle
+}
+
+; float16x8_t dup_lane_128(float16x4_t a) { return vdupq_lane_f16(a, 2); }
+define <8 x half> @dup_lane_128(<4 x half> %a) #0 {
+entry:
+; CHECK-LABEL: dup_lane_128:
+; CHECK: dup
+  %shuffle = shufflevector <4 x half> %a, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x half> %shuffle
+}
+
+; float16x4_t dup_laneq_64(float16x8_t a) { return vdup_laneq_f16(a, 2); }
+define <4 x half> @dup_laneq_64(<8 x half> %a) #0 {
+entry:
+; CHECK-LABEL: dup_laneq_64:
+; CHECK: dup
+  %shuffle = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x half> %shuffle
+}
+
+; float16x8_t dup_laneq_128(float16x8_t a) { return vdupq_laneq_f16(a, 2); }
+define <8 x half> @dup_laneq_128(<8 x half> %a) #0 {
+entry:
+; CHECK-LABEL: dup_laneq_128:
+; CHECK: dup
+  %shuffle = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x half> %shuffle
+}
+
+; float16x8_t vcombine(float16x4_t a, float16x4_t b) { return vcombine_f16(a, b); }
+define <8 x half> @vcombine(<4 x half> %a, <4 x half> %b) #0 {
+entry:
+; CHECK-LABEL: vcombine:
+; CHECK: ins
+  %shuffle.i = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x half> %shuffle.i
+}
+
+; float16x4_t get_high(float16x8_t a) { return vget_high_f16(a); }
+define <4 x half> @get_high(<8 x half> %a) #0 {
+; CHECK-LABEL: get_high:
+; CHECK: ext
+entry:
+  %shuffle.i = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x half> %shuffle.i
+}
+
+
+; float16x4_t get_low(float16x8_t a) { return vget_low_f16(a); }
+define <4 x half> @get_low(<8 x half> %a) #0 {
+; CHECK-LABEL: get_low:
+; CHECK-NOT: ext
+entry:
+  %shuffle.i = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x half> %shuffle.i
+}
+
+; float16x4_t set_lane_64(float16x4_t a, __fp16 b) { return vset_lane_f16(b, a, 2); }
+define <4 x half> @set_lane_64(<4 x half> %a, half %b) #0 {
+; CHECK-LABEL: set_lane_64:
+; CHECK: fmov
+; CHECK: ins
+entry:
+  %0 = bitcast half %b to i16
+  %1 = bitcast <4 x half> %a to <4 x i16>
+  %vset_lane = insertelement <4 x i16> %1, i16 %0, i32 2
+  %2 = bitcast <4 x i16> %vset_lane to <4 x half>
+  ret <4 x half> %2
+}
+
+
+; float16x8_t set_lane_128(float16x8_t a, __fp16 b) { return vsetq_lane_f16(b, a, 2); }
+define <8 x half> @set_lane_128(<8 x half> %a, half %b) #0 {
+; CHECK-LABEL: set_lane_128:
+; CHECK: fmov
+; CHECK: ins
+entry:
+  %0 = bitcast half %b to i16
+  %1 = bitcast <8 x half> %a to <8 x i16>
+  %vset_lane = insertelement <8 x i16> %1, i16 %0, i32 2
+  %2 = bitcast <8 x i16> %vset_lane to <8 x half>
+  ret <8 x half> %2
+}
+
+; __fp16 get_lane_64(float16x4_t a) { return vget_lane_f16(a, 2); }
+define half @get_lane_64(<4 x half> %a) #0 {
+; CHECK-LABEL: get_lane_64:
+; CHECK: umov
+; CHECK: fmov
+entry:
+  %0 = bitcast <4 x half> %a to <4 x i16>
+  %vget_lane = extractelement <4 x i16> %0, i32 2
+  %1 = bitcast i16 %vget_lane to half
+  ret half %1
+}
+
+; __fp16 get_lane_128(float16x8_t a) { return vgetq_lane_f16(a, 2); }
+define half @get_lane_128(<8 x half> %a) #0 {
+; CHECK-LABEL: get_lane_128:
+; CHECK: umov
+; CHECK: fmov
+entry:
+  %0 = bitcast <8 x half> %a to <8 x i16>
+  %vgetq_lane = extractelement <8 x i16> %0, i32 2
+  %1 = bitcast i16 %vgetq_lane to half
+  ret half %1
+}
diff --git a/test/CodeGen/AArch64/fpconv-vector-op-scalarize.ll b/test/CodeGen/AArch64/fpconv-vector-op-scalarize.ll
new file mode 100644
index 0000000..56e0b4a
--- /dev/null
+++ b/test/CodeGen/AArch64/fpconv-vector-op-scalarize.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
+; PR20778
+; Check that the legalizer doesn't crash when scalarizing FP conversion
+; instructions' operands.  The operands are all illegal on AArch64,
+; ensuring they are legalized.  The results are all legal.
+
+define <1 x double> @test_sitofp(<1 x i1> %in) {
+; CHECK-LABEL: test_sitofp:
+; CHECK:       sbfx  [[GPR:w[0-9]+]], w0, #0, #1
+; CHECK-NEXT:  scvtf d0, [[GPR]]
+; CHECK-NEXT:  ret
+entry:
+  %0 = sitofp <1 x i1> %in to <1 x double>
+  ret <1 x double> %0
+}
+
+define <1 x double> @test_uitofp(<1 x i1> %in) {
+; CHECK-LABEL: test_uitofp:
+; CHECK:       and   [[GPR:w[0-9]+]], w0, #0x1
+; CHECK-NEXT:  ucvtf d0, [[GPR]]
+; CHECK-NEXT:  ret
+entry:
+  %0 = uitofp <1 x i1> %in to <1 x double>
+  ret <1 x double> %0
+}
+
+define <1 x i64> @test_fptosi(<1 x fp128> %in) {
+; CHECK-LABEL: test_fptosi:
+; CHECK:       bl    ___fixtfdi
+; CHECK-NEXT:  fmov  d0, x0
+entry:
+  %0 = fptosi <1 x fp128> %in to <1 x i64>
+  ret <1 x i64> %0
+}
+
+define <1 x i64> @test_fptoui(<1 x fp128> %in) {
+; CHECK-LABEL: test_fptoui:
+; CHECK:       bl    ___fixunstfdi
+; CHECK-NEXT:  fmov  d0, x0
+entry:
+  %0 = fptoui <1 x fp128> %in to <1 x i64>
+  ret <1 x i64> %0
+}
diff --git a/test/CodeGen/AArch64/frameaddr.ll b/test/CodeGen/AArch64/frameaddr.ll
index 85d95e2..d6bb50e 100644
--- a/test/CodeGen/AArch64/frameaddr.ll
+++ b/test/CodeGen/AArch64/frameaddr.ll
@@ -1,20 +1,29 @@
-; RUN: llc -o - %s -mtriple=arm64-apple-ios7.0  | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin                             -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
 
-define i8* @t() nounwind {
+define i8* @test_frameaddress0() nounwind {
 entry:
-; CHECK-LABEL: t:
+; CHECK-LABEL: test_frameaddress0:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
 ; CHECK: mov x0, x29
-	%0 = call i8* @llvm.frameaddress(i32 0)
-        ret i8* %0
+; CHECK: ldp x29, x30, [sp], #16
+; CHECK: ret
+  %0 = call i8* @llvm.frameaddress(i32 0)
+  ret i8* %0
 }
 
-define i8* @t2() nounwind {
+define i8* @test_frameaddress2() nounwind {
 entry:
-; CHECK-LABEL: t2:
+; CHECK-LABEL: test_frameaddress2:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
 ; CHECK: ldr x[[reg:[0-9]+]], [x29]
-; CHECK: ldr {{x[0-9]+}}, [x[[reg]]]
-	%0 = call i8* @llvm.frameaddress(i32 2)
-        ret i8* %0
+; CHECK: ldr x0, [x[[reg]]]
+; CHECK: ldp x29, x30, [sp], #16
+; CHECK: ret
+  %0 = call i8* @llvm.frameaddress(i32 2)
+  ret i8* %0
 }
 
 declare i8* @llvm.frameaddress(i32) nounwind readnone
diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll
index 422c576..51979f0 100644
--- a/test/CodeGen/AArch64/func-calls.ll
+++ b/test/CodeGen/AArch64/func-calls.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-neon | FileCheck --check-prefix=CHECK-NONEON %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
 
 %myStruct = type { i64 , i8, i32 }
 
diff --git a/test/CodeGen/AArch64/half.ll b/test/CodeGen/AArch64/half.ll
new file mode 100644
index 0000000..a46094b
--- /dev/null
+++ b/test/CodeGen/AArch64/half.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+
+define void @test_load_store(half* %in, half* %out) {
+; CHECK-LABEL: test_load_store:
+; CHECK: ldr [[TMP:h[0-9]+]], [x0]
+; CHECK: str [[TMP]], [x1]
+  %val = load half* %in
+  store half %val, half* %out
+  ret void
+}
+
+define i16 @test_bitcast_from_half(half* %addr) {
+; CHECK-LABEL: test_bitcast_from_half:
+; CHECK: ldrh w0, [x0]
+  %val = load half* %addr
+  %val_int = bitcast half %val to i16
+  ret i16 %val_int
+}
+
+define i16 @test_reg_bitcast_from_half(half %in) {
+; CHECK-LABEL: test_reg_bitcast_from_half:
+; CHECK-NOT: str
+; CHECK-NOT: ldr
+; CHECK-DAG: fmov w0, s0
+; CHECK: ret
+  %val = bitcast half %in to i16
+  ret i16 %val
+}
+
+define void @test_bitcast_to_half(half* %addr, i16 %in) {
+; CHECK-LABEL: test_bitcast_to_half:
+; CHECK: strh w1, [x0]
+  %val_fp = bitcast i16 %in to half
+  store half %val_fp, half* %addr
+  ret void
+}
+
+define half @test_reg_bitcast_to_half(i16 %in) {
+; CHECK-LABEL: test_reg_bitcast_to_half:
+; CHECK-NOT: str
+; CHECK-NOT: ldr
+; CHECK-DAG: fmov s0, w0
+; CHECK: ret
+
+  %val = bitcast i16 %in to half
+  ret half %val
+}
+
+define float @test_extend32(half* %addr) {
+; CHECK-LABEL: test_extend32:
+; CHECK: fcvt {{s[0-9]+}}, {{h[0-9]+}}
+
+  %val16 = load half* %addr
+  %val32 = fpext half %val16 to float
+  ret float %val32
+}
+
+define double @test_extend64(half* %addr) {
+; CHECK-LABEL: test_extend64:
+; CHECK: fcvt {{d[0-9]+}}, {{h[0-9]+}}
+
+  %val16 = load half* %addr
+  %val32 = fpext half %val16 to double
+  ret double %val32
+}
+
+define void @test_trunc32(float %in, half* %addr) {
+; CHECK-LABEL: test_trunc32:
+; CHECK: fcvt {{h[0-9]+}}, {{s[0-9]+}}
+
+  %val16 = fptrunc float %in to half
+  store half %val16, half* %addr
+  ret void
+}
+
+define void @test_trunc64(double %in, half* %addr) {
+; CHECK-LABEL: test_trunc64:
+; CHECK: fcvt {{h[0-9]+}}, {{d[0-9]+}}
+
+  %val16 = fptrunc double %in to half
+  store half %val16, half* %addr
+  ret void
+}
diff --git a/test/CodeGen/AArch64/hints.ll b/test/CodeGen/AArch64/hints.ll
new file mode 100644
index 0000000..d7d9e23
--- /dev/null
+++ b/test/CodeGen/AArch64/hints.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mtriple aarch64-eabi -o - %s | FileCheck %s
+
+declare void @llvm.aarch64.hint(i32) nounwind
+
+define void @hint_nop() {
+entry:
+  tail call void @llvm.aarch64.hint(i32 0) nounwind
+  ret void
+}
+
+; CHECK-LABEL: hint_nop
+; CHECK: nop
+
+define void @hint_yield() {
+entry:
+  tail call void @llvm.aarch64.hint(i32 1) nounwind
+  ret void
+}
+
+; CHECK-LABEL: hint_yield
+; CHECK: yield
+
+define void @hint_wfe() {
+entry:
+  tail call void @llvm.aarch64.hint(i32 2) nounwind
+  ret void
+}
+
+; CHECK-LABEL: hint_wfe
+; CHECK: wfe
+
+define void @hint_wfi() {
+entry:
+  tail call void @llvm.aarch64.hint(i32 3) nounwind
+  ret void
+}
+
+; CHECK-LABEL: hint_wfi
+; CHECK: wfi
+
+define void @hint_sev() {
+entry:
+  tail call void @llvm.aarch64.hint(i32 4) nounwind
+  ret void
+}
+
+; CHECK-LABEL: hint_sev
+; CHECK: sev
+
+define void @hint_sevl() {
+entry:
+  tail call void @llvm.aarch64.hint(i32 5) nounwind
+  ret void
+}
+
+; CHECK-LABEL: hint_sevl
+; CHECK: sevl
+
+define void @hint_undefined() {
+entry:
+  tail call void @llvm.aarch64.hint(i32 8) nounwind
+  ret void
+}
+
+; CHECK-LABEL: hint_undefined
+; CHECK: hint #0x8
+
diff --git a/test/CodeGen/AArch64/init-array.ll b/test/CodeGen/AArch64/init-array.ll
index f47b490..a275e7e 100644
--- a/test/CodeGen/AArch64/init-array.ll
+++ b/test/CodeGen/AArch64/init-array.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -use-init-array -o - %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -use-init-array -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -o - %s | FileCheck %s
 
 define internal void @_GLOBAL__I_a() section ".text.startup" {
   ret void
diff --git a/test/CodeGen/AArch64/intrinsics-memory-barrier.ll b/test/CodeGen/AArch64/intrinsics-memory-barrier.ll
new file mode 100644
index 0000000..09e34ae
--- /dev/null
+++ b/test/CodeGen/AArch64/intrinsics-memory-barrier.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mtriple=aarch64-eabi -O=3 | FileCheck %s
+
+define void @test() {
+  ; CHECK: dmb sy
+  call void @llvm.aarch64.dmb(i32 15)
+  ; CHECK: dmb osh
+  call void @llvm.aarch64.dmb(i32 3)
+  ; CHECK: dsb sy
+  call void @llvm.aarch64.dsb(i32 15)
+  ; CHECK: dsb ishld
+  call void @llvm.aarch64.dsb(i32 9)
+  ; CHECK: isb
+  call void @llvm.aarch64.isb(i32 15)
+  ret void
+}
+
+; Important point is that the compiler should not reorder memory access
+; instructions around DMB.
+; Failure to do so, two STRs will collapse into one STP.
+define void @test_dmb_reordering(i32 %a, i32 %b, i32* %d) {
+  store i32 %a, i32* %d              ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}]
+
+  call void @llvm.aarch64.dmb(i32 15); CHECK: dmb sy
+
+  %d1 = getelementptr i32* %d, i64 1
+  store i32 %b, i32* %d1             ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #4]
+
+  ret void
+}
+
+; Similarly for DSB.
+define void @test_dsb_reordering(i32 %a, i32 %b, i32* %d) {
+  store i32 %a, i32* %d              ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}]
+
+  call void @llvm.aarch64.dsb(i32 15); CHECK: dsb sy
+
+  %d1 = getelementptr i32* %d, i64 1
+  store i32 %b, i32* %d1             ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #4]
+
+  ret void
+}
+
+; And ISB.
+define void @test_isb_reordering(i32 %a, i32 %b, i32* %d) {
+  store i32 %a, i32* %d              ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}]
+
+  call void @llvm.aarch64.isb(i32 15); CHECK: isb
+
+  %d1 = getelementptr i32* %d, i64 1
+  store i32 %b, i32* %d1             ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #4]
+
+  ret void
+}
+
+declare void @llvm.aarch64.dmb(i32)
+declare void @llvm.aarch64.dsb(i32)
+declare void @llvm.aarch64.isb(i32)
diff --git a/test/CodeGen/AArch64/jump-table.ll b/test/CodeGen/AArch64/jump-table.ll
index 69fbd99..16682e9 100644
--- a/test/CodeGen/AArch64/jump-table.ll
+++ b/test/CodeGen/AArch64/jump-table.ll
@@ -56,10 +56,11 @@ lbl4:
 ; CHECK-NEXT: .xword
 
 ; CHECK-PIC-NOT: .data_region
+; CHECK-PIC-NOT: .LJTI0_0
 ; CHECK-PIC: .LJTI0_0:
-; CHECK-PIC-NEXT: .word
-; CHECK-PIC-NEXT: .word
-; CHECK-PIC-NEXT: .word
-; CHECK-PIC-NEXT: .word
-; CHECK-PIC-NEXT: .word
+; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
+; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
+; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
+; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
+; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
 ; CHECK-PIC-NOT: .end_data_region
diff --git a/test/CodeGen/AArch64/legalize-bug-bogus-cpu.ll b/test/CodeGen/AArch64/legalize-bug-bogus-cpu.ll
new file mode 100644
index 0000000..b785a8f
--- /dev/null
+++ b/test/CodeGen/AArch64/legalize-bug-bogus-cpu.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=aarch64 -mcpu=bogus -o - %s
+
+; Fix the bug in PR20557. Set mcpu to a bogus name, llc will crash in type
+; legalization.
+define <4 x float> @fneg4(<4 x float> %x) {
+  %sub = fsub <4 x float> zeroinitializer, %x
+  ret <4 x float> %sub
+}
diff --git a/test/CodeGen/AArch64/machine_cse.ll b/test/CodeGen/AArch64/machine_cse.ll
new file mode 100644
index 0000000..bc9ab10
--- /dev/null
+++ b/test/CodeGen/AArch64/machine_cse.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 | FileCheck %s
+
+; marked as external to prevent possible optimizations
+@a = external global i32
+@b = external global i32
+@c = external global i32
+@d = external global i32
+@e = external global i32
+
+define void @combine-sign-comparisons-by-cse(i32 *%arg) {
+; CHECK: cmp
+; CHECK: b.ge
+; CHECK-NOT: cmp
+; CHECK: b.le
+
+entry:
+  %a = load i32* @a, align 4
+  %b = load i32* @b, align 4
+  %c = load i32* @c, align 4
+  %d = load i32* @d, align 4
+  %e = load i32* @e, align 4
+
+  %cmp = icmp slt i32 %a, %e
+  br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true:
+  %cmp1 = icmp eq i32 %b, %c
+  br i1 %cmp1, label %return, label %if.end
+
+lor.lhs.false:
+  %cmp2 = icmp sgt i32 %a, %e
+  br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3:
+  %cmp4 = icmp eq i32 %b, %d
+  br i1 %cmp4, label %return, label %if.end
+
+if.end:
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+  store i32 %a, i32 *%arg
+  ret void
+}
diff --git a/test/CodeGen/AArch64/madd-combiner.ll b/test/CodeGen/AArch64/madd-combiner.ll
new file mode 100644
index 0000000..7c9787a
--- /dev/null
+++ b/test/CodeGen/AArch64/madd-combiner.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=aarch64-apple-darwin            -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s | FileCheck %s
+
+; Test that we use the correct register class.
+define i32 @mul_add_imm(i32 %a, i32 %b) {
+; CHECK-LABEL: mul_add_imm
+; CHECK:       orr [[REG:w[0-9]+]], wzr, #0x4
+; CHECK-NEXT:  madd  {{w[0-9]+}}, w0, w1, [[REG]]
+  %1 = mul i32 %a, %b
+  %2 = add i32 %1, 4
+  ret i32 %2
+}
+
+define i32 @mul_sub_imm1(i32 %a, i32 %b) {
+; CHECK-LABEL: mul_sub_imm1
+; CHECK:       orr [[REG:w[0-9]+]], wzr, #0x4
+; CHECK-NEXT:  msub  {{w[0-9]+}}, w0, w1, [[REG]]
+  %1 = mul i32 %a, %b
+  %2 = sub i32 4, %1
+  ret i32 %2
+}
+
+; bugpoint reduced test case. This only tests that we pass the MI verifier.
+define void @mul_add_imm2() {
+entry:
+  br label %for.body
+for.body:
+  br i1 undef, label %for.body, label %for.body8
+for.body8:
+  %0 = mul i64 undef, -3
+  %mul1971 = add i64 %0, -3
+  %cmp7 = icmp slt i64 %mul1971, 1390451930000
+  br i1 %cmp7, label %for.body8, label %for.end20
+for.end20:
+  ret void
+}
+
diff --git a/test/CodeGen/AArch64/madd-lohi.ll b/test/CodeGen/AArch64/madd-lohi.ll
new file mode 100644
index 0000000..550a8cb
--- /dev/null
+++ b/test/CodeGen/AArch64/madd-lohi.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s
+
+define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
+; CHECK-LABEL: test_128bitmul:
+; CHECK-DAG: umulh [[CARRY:x[0-9]+]], x0, x2
+; CHECK-DAG: madd [[PART1:x[0-9]+]], x0, x3, [[CARRY]]
+; CHECK: madd x1, x1, x2, [[PART1]]
+; CHECK: mul x0, x0, x2
+
+; CHECK-BE-LABEL: test_128bitmul:
+; CHECK-BE-DAG: umulh [[CARRY:x[0-9]+]], x1, x3
+; CHECK-BE-DAG: madd [[PART1:x[0-9]+]], x1, x2, [[CARRY]]
+; CHECK-BE: madd x0, x0, x3, [[PART1]]
+; CHECK-BE: mul x1, x1, x3
+
+  %prod = mul i128 %lhs, %rhs
+  ret i128 %prod
+}
diff --git a/test/CodeGen/AArch64/mul-lohi.ll b/test/CodeGen/AArch64/mul-lohi.ll
index 0689fbd..4515697 100644
--- a/test/CodeGen/AArch64/mul-lohi.ll
+++ b/test/CodeGen/AArch64/mul-lohi.ll
@@ -1,17 +1,16 @@
-; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s
-; RUN: llc -mtriple=arm64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s
-
+; RUN: llc -mtriple=arm64-apple-ios7.0 -mcpu=cyclone %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-linux-gnu -mcpu=cyclone %s -o - | FileCheck --check-prefix=CHECK-BE %s
 define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: test_128bitmul:
+; CHECK-DAG: mul [[PART1:x[0-9]+]], x0, x3
 ; CHECK-DAG: umulh [[CARRY:x[0-9]+]], x0, x2
-; CHECK-DAG: madd [[PART1:x[0-9]+]], x0, x3, [[CARRY]]
-; CHECK: madd x1, x1, x2, [[PART1]]
+; CHECK: mul [[PART2:x[0-9]+]], x1, x2
 ; CHECK: mul x0, x0, x2
 
 ; CHECK-BE-LABEL: test_128bitmul:
+; CHECK-BE-DAG: mul [[PART1:x[0-9]+]], x1, x2
 ; CHECK-BE-DAG: umulh [[CARRY:x[0-9]+]], x1, x3
-; CHECK-BE-DAG: madd [[PART1:x[0-9]+]], x1, x2, [[CARRY]]
-; CHECK-BE: madd x0, x0, x3, [[PART1]]
+; CHECK-BE: mul [[PART2:x[0-9]+]], x0, x3
 ; CHECK-BE: mul x1, x1, x3
 
   %prod = mul i128 %lhs, %rhs
diff --git a/test/CodeGen/AArch64/neon-perm.ll b/test/CodeGen/AArch64/neon-perm.ll
index 4f8571d..41e391d 100644
--- a/test/CodeGen/AArch64/neon-perm.ll
+++ b/test/CodeGen/AArch64/neon-perm.ll
@@ -1387,6 +1387,13 @@ entry:
   ret <8 x i16> %shuffle.i
 }
 
+define <4 x i8> @test_vzip1_v4i8(<8 x i8> %p) {
+; CHECK-LABEL: test_vzip1_v4i8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %lo = shufflevector <8 x i8> %p, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i8> %lo
+}
+
 define <8 x i8> @test_same_vzip2_s8(<8 x i8> %a) {
 ; CHECK-LABEL: test_same_vzip2_s8:
 ; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
diff --git a/test/CodeGen/AArch64/neon-scalar-copy.ll b/test/CodeGen/AArch64/neon-scalar-copy.ll
index a01df32..6afac31 100644
--- a/test/CodeGen/AArch64/neon-scalar-copy.ll
+++ b/test/CodeGen/AArch64/neon-scalar-copy.ll
@@ -101,3 +101,20 @@ define <1 x i64> @test_vector_copy_dup_dv2D(<1 x i64> %a, <2 x i64> %c) {
   ret <1 x i64> %vset_lane
 }
 
+; Undefined behaviour, so we really don't care what actually gets emitted, just
+; as long as we don't crash (since it could be dynamically unreachable).
+define i32 @test_out_of_range_extract(<4 x i32> %vec) {
+; CHECK-LABEL: test_out_of_range_extract:
+; CHECK: ret
+  %elt = extractelement <4 x i32> %vec, i32 4
+  ret i32 %elt
+}
+
+; Undefined behaviour, so we really don't care what actually gets emitted, just
+; as long as we don't crash (since it could be dynamically unreachable).
+define void @test_out_of_range_insert(<4 x i32> %vec, i32 %elt) {
+; CHECK-LABEL: test_out_of_range_insert:
+; CHECK: ret
+  insertelement <4 x i32> %vec, i32 %elt, i32 4
+  ret void
+}
diff --git a/test/CodeGen/AArch64/paired-load.ll b/test/CodeGen/AArch64/paired-load.ll
new file mode 100644
index 0000000..3dddb9e
--- /dev/null
+++ b/test/CodeGen/AArch64/paired-load.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linux-gnu"
+
+; Ensure we're generating ldp instructions instead of ldr Q.
+; CHECK: ldp
+; CHECK: stp
+define void @f(i64* %p, i64* %q) {
+  %addr2 = getelementptr i64* %q, i32 1
+  %addr = getelementptr i64* %p, i32 1
+  %x = load i64* %p
+  %y = load i64* %addr
+  store i64 %x, i64* %q
+  store i64 %y, i64* %addr2
+  ret void
+}
diff --git a/test/CodeGen/AArch64/pic-eh-stubs.ll b/test/CodeGen/AArch64/pic-eh-stubs.ll
index e8c7625..93ee0e6 100644
--- a/test/CodeGen/AArch64/pic-eh-stubs.ll
+++ b/test/CodeGen/AArch64/pic-eh-stubs.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
-; RUN: llc -mtriple=arm64_be-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
 
 ; Make sure exception-handling PIC code can be linked correctly. An alternative
 ; to the sequence described below would have .gcc_except_table itself writable
diff --git a/test/CodeGen/AArch64/postra-mi-sched.ll b/test/CodeGen/AArch64/postra-mi-sched.ll
new file mode 100644
index 0000000..5a40724
--- /dev/null
+++ b/test/CodeGen/AArch64/postra-mi-sched.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -O3 -march=aarch64 -mcpu=cortex-a53 | FileCheck %s
+
+; With cortex-a53, each of fmul and fcvt have latency of 6 cycles.  After the
+; pre-RA MI scheduler, fmul, fcvt and fdiv will be consecutive.  The top-down
+; post-RA MI scheduler will clean this up.
+
+@d1 = common global double 0.000000e+00, align 8
+
+define i32 @test1(float %s2, float %s3, double %d, i32 %i2, i32 %i3) {
+entry:
+; CHECK-LABEL: @test1
+; CHECK: fmul
+; CHECK-NEXT: add
+; CHECK: fcvt
+; CHECK-NEXT: mul
+  %mul = fmul float %s2, %s3
+  %conv = fpext float %mul to double
+  %div = fdiv double %d, %conv
+  store double %div, double* @d1, align 8
+  %factor = shl i32 %i3, 1
+  %add1 = add i32 %i2, 4
+  %add2 = add i32 %add1, %factor
+  %add3 = add nsw i32 %add2, %i2
+  %add4 = add nsw i32 %add3, %add2
+  %mul5 = mul i32 %add3, %add3
+  %mul6 = mul i32 %mul5, %add4
+  %mul7 = shl i32 %add4, 1
+  %factor18 = mul i32 %mul7, %mul6
+  %add9 = add i32 %factor18, %mul6
+  ret i32 %add9
+}
diff --git a/test/CodeGen/AArch64/rbit.ll b/test/CodeGen/AArch64/rbit.ll
new file mode 100644
index 0000000..3404ae4
--- /dev/null
+++ b/test/CodeGen/AArch64/rbit.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s
+
+; CHECK-LABEL: rbit32
+; CHECK: rbit w0, w0
+define i32 @rbit32(i32 %t) {
+entry:
+  %rbit.i = call i32 @llvm.aarch64.rbit.i32(i32 %t)
+  ret i32 %rbit.i
+}
+
+; CHECK-LABEL: rbit64
+; CHECK: rbit x0, x0
+define i64 @rbit64(i64 %t) {
+entry:
+  %rbit.i = call i64 @llvm.aarch64.rbit.i64(i64 %t)
+  ret i64 %rbit.i
+}
+
+declare i64 @llvm.aarch64.rbit.i64(i64)
+declare i32 @llvm.aarch64.rbit.i32(i32)
diff --git a/test/CodeGen/AArch64/rm_redundant_cmp.ll b/test/CodeGen/AArch64/rm_redundant_cmp.ll
new file mode 100644
index 0000000..36dc118
--- /dev/null
+++ b/test/CodeGen/AArch64/rm_redundant_cmp.ll
@@ -0,0 +1,254 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 | FileCheck %s
+
+; The following cases are for i16
+
+%struct.s_signed_i16 = type { i16, i16, i16 }
+%struct.s_unsigned_i16 = type { i16, i16, i16 }
+
+@cost_s_i8_i16 = common global %struct.s_signed_i16 zeroinitializer, align 2
+@cost_u_i16 = common global %struct.s_unsigned_i16 zeroinitializer, align 2
+
+define void @test_i16_2cmp_signed_1() {
+; CHECK-LABEL: test_i16_2cmp_signed_1
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.gt
+; CHECK-NOT: cmp
+; CHECK: b.ne
+entry:
+  %0 = load i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 1), align 2
+  %1 = load i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 2), align 2
+  %cmp = icmp sgt i16 %0, %1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i16 %0, i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp eq i16 %0, %1
+  br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.else
+  store i16 %0, i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %if.then7, %if.then
+  ret void
+}
+
+define void @test_i16_2cmp_signed_2() {
+; CHECK-LABEL: test_i16_2cmp_signed_2
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.le
+; CHECK-NOT: cmp
+; CHECK: b.ge
+entry:
+  %0 = load i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 1), align 2
+  %1 = load i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 2), align 2
+  %cmp = icmp sgt i16 %0, %1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i16 %0, i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp slt i16 %0, %1
+  br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.else
+  store i16 %1, i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %if.then7, %if.then
+  ret void
+}
+
+define void @test_i16_2cmp_unsigned_1() {
+; CHECK-LABEL: test_i16_2cmp_unsigned_1
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.hi
+; CHECK-NOT: cmp
+; CHECK: b.ne
+entry:
+  %0 = load i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 1), align 2
+  %1 = load i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 2), align 2
+  %cmp = icmp ugt i16 %0, %1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i16 %0, i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp eq i16 %0, %1
+  br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.else
+  store i16 %0, i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %if.then7, %if.then
+  ret void
+}
+
+define void @test_i16_2cmp_unsigned_2() {
+; CHECK-LABEL: test_i16_2cmp_unsigned_2
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.ls
+; CHECK-NOT: cmp
+; CHECK: b.hs
+entry:
+  %0 = load i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 1), align 2
+  %1 = load i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 2), align 2
+  %cmp = icmp ugt i16 %0, %1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i16 %0, i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp ult i16 %0, %1
+  br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.else
+  store i16 %1, i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %if.then7, %if.then
+  ret void
+}
+
+; The following cases are for i8
+
+%struct.s_signed_i8 = type { i8, i8, i8 }
+%struct.s_unsigned_i8 = type { i8, i8, i8 }
+
+@cost_s = common global %struct.s_signed_i8 zeroinitializer, align 2
+@cost_u_i8 = common global %struct.s_unsigned_i8 zeroinitializer, align 2
+
+
+define void @test_i8_2cmp_signed_1() {
+; CHECK-LABEL: test_i8_2cmp_signed_1
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.gt
+; CHECK-NOT: cmp
+; CHECK: b.ne
+entry:
+  %0 = load i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 1), align 2
+  %1 = load i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 2), align 2
+  %cmp = icmp sgt i8 %0, %1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i8 %0, i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp eq i8 %0, %1
+  br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.else
+  store i8 %0, i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %if.then7, %if.then
+  ret void
+}
+
+define void @test_i8_2cmp_signed_2() {
+; CHECK-LABEL: test_i8_2cmp_signed_2
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.le
+; CHECK-NOT: cmp
+; CHECK: b.ge
+entry:
+  %0 = load i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 1), align 2
+  %1 = load i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 2), align 2
+  %cmp = icmp sgt i8 %0, %1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i8 %0, i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp slt i8 %0, %1
+  br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.else
+  store i8 %1, i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %if.then7, %if.then
+  ret void
+}
+
+define void @test_i8_2cmp_unsigned_1() {
+; CHECK-LABEL: test_i8_2cmp_unsigned_1
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.hi
+; CHECK-NOT: cmp
+; CHECK: b.ne
+entry:
+  %0 = load i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 1), align 2
+  %1 = load i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 2), align 2
+  %cmp = icmp ugt i8 %0, %1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i8 %0, i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp eq i8 %0, %1
+  br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.else
+  store i8 %0, i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %if.then7, %if.then
+  ret void
+}
+
+define void @test_i8_2cmp_unsigned_2() {
+; CHECK-LABEL: test_i8_2cmp_unsigned_2
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.ls
+; CHECK-NOT: cmp
+; CHECK: b.hs
+entry:
+  %0 = load i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 1), align 2
+  %1 = load i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 2), align 2
+  %cmp = icmp ugt i8 %0, %1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i8 %0, i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp ult i8 %0, %1
+  br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.else
+  store i8 %1, i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %if.then7, %if.then
+  ret void
+}
+
+; Make sure the case below won't crash.
+
+; The optimization of ZERO_EXTEND and SIGN_EXTEND in type legalization stage can't assert
+; the operand of a set_cc is always a TRUNCATE.
+
+define i1 @foo(float %inl, float %inr) {
+  %lval = fptosi float %inl to i8
+  %rval = fptosi float %inr to i8
+  %sum = icmp eq i8 %lval, %rval
+  ret i1 %sum
+}
diff --git a/test/CodeGen/AArch64/sdivpow2.ll b/test/CodeGen/AArch64/sdivpow2.ll
new file mode 100644
index 0000000..6c02ea9
--- /dev/null
+++ b/test/CodeGen/AArch64/sdivpow2.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=arm64-linux-gnu -fast-isel=0 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -fast-isel=1 -verify-machineinstrs < %s | FileCheck %s
+
+define i32 @test1(i32 %x) {
+; CHECK-LABEL: test1
+; CHECK: add w8, w0, #7
+; CHECK: cmp w0, #0
+; CHECK: csel w8, w8, w0, lt
+; CHECK: asr w0, w8, #3
+  %div = sdiv i32 %x, 8
+  ret i32 %div
+}
+
+define i32 @test2(i32 %x) {
+; CHECK-LABEL: test2
+; CHECK: add w8, w0, #7
+; CHECK: cmp w0, #0
+; CHECK: csel w8, w8, w0, lt
+; CHECK: neg w0, w8, asr #3
+  %div = sdiv i32 %x, -8
+  ret i32 %div
+}
+
+define i32 @test3(i32 %x) {
+; CHECK-LABEL: test3
+; CHECK: add w8, w0, #31
+; CHECK: cmp w0, #0
+; CHECK: csel w8, w8, w0, lt
+; CHECK: asr w0, w8, #5
+  %div = sdiv i32 %x, 32
+  ret i32 %div
+}
+
+define i64 @test4(i64 %x) {
+; CHECK-LABEL: test4
+; CHECK: add x8, x0, #7
+; CHECK: cmp x0, #0
+; CHECK: csel x8, x8, x0, lt
+; CHECK: asr x0, x8, #3
+  %div = sdiv i64 %x, 8
+  ret i64 %div
+}
+
+define i64 @test5(i64 %x) {
+; CHECK-LABEL: test5
+; CHECK: add x8, x0, #7
+; CHECK: cmp x0, #0
+; CHECK: csel x8, x8, x0, lt
+; CHECK: neg x0, x8, asr #3
+  %div = sdiv i64 %x, -8
+  ret i64 %div
+}
+
+define i64 @test6(i64 %x) {
+; CHECK-LABEL: test6
+; CHECK: add x8, x0, #63
+; CHECK: cmp x0, #0
+; CHECK: csel x8, x8, x0, lt
+; CHECK: asr x0, x8, #6
+  %div = sdiv i64 %x, 64
+  ret i64 %div
+}
+
+define i64 @test7(i64 %x) {
+; CHECK-LABEL: test7
+; CHECK: orr [[REG:x[0-9]+]], xzr, #0xffffffffffff
+; CHECK: add x8, x0, [[REG]]
+; CHECK: cmp x0, #0
+; CHECK: csel x8, x8, x0, lt
+; CHECK: asr x0, x8, #48
+  %div = sdiv i64 %x, 281474976710656
+  ret i64 %div
+}
+
diff --git a/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll b/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll
new file mode 100644
index 0000000..bedbf5f
--- /dev/null
+++ b/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios -relocation-model=pic | FileCheck %s
+
+@__stack_chk_guard = external global i64*
+
+; PR20558
+
+; CHECK: adrp [[R0:x[0-9]+]], ___stack_chk_guard@GOTPAGE
+; CHECK: ldr  [[R1:x[0-9]+]], {{\[}}[[R0]], ___stack_chk_guard@GOTPAGEOFF{{\]}}
+; CHECK: ldr  [[R2:x[0-9]+]], {{\[}}[[R1]]{{\]}}
+; CHECK: stur [[R2]], {{\[}}x29, [[SLOT0:[0-9#\-]+]]{{\]}}
+; CHECK: ldur [[R3:x[0-9]+]], {{\[}}x29, [[SLOT0]]{{\]}}
+; CHECK: sub  [[R4:x[0-9]+]], [[R2]], [[R3]]
+; CHECK: cbnz [[R4]], LBB
+
+define i32 @test_stack_guard_remat2() {
+entry:
+  %StackGuardSlot = alloca i8*
+  %StackGuard = load i8** bitcast (i64** @__stack_chk_guard to i8**)
+  call void @llvm.stackprotector(i8* %StackGuard, i8** %StackGuardSlot)
+  %container = alloca [32 x i8], align 1
+  call void @llvm.stackprotectorcheck(i8** bitcast (i64** @__stack_chk_guard to i8**))
+  ret i32 -1
+}
+
+declare void @llvm.stackprotector(i8*, i8**)
+declare void @llvm.stackprotectorcheck(i8**)
diff --git a/test/CodeGen/AArch64/stack_guard_remat.ll b/test/CodeGen/AArch64/stack_guard_remat.ll
new file mode 100644
index 0000000..cee7266
--- /dev/null
+++ b/test/CodeGen/AArch64/stack_guard_remat.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -mtriple=arm64-apple-ios -relocation-model=static -no-integrated-as | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=PIC-LINUX
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -relocation-model=static -code-model=large -no-integrated-as | FileCheck %s -check-prefix=STATIC-LARGE
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -relocation-model=static -code-model=small -no-integrated-as | FileCheck %s -check-prefix=STATIC-SMALL
+
+; DARWIN: foo2
+; DARWIN: adrp [[R0:x[0-9]+]], ___stack_chk_guard@GOTPAGE
+; DARWIN: ldr [[R1:x[0-9]+]], {{\[}}[[R0]], ___stack_chk_guard@GOTPAGEOFF{{\]}}
+; DARWIN: ldr {{x[0-9]+}}, {{\[}}[[R1]]{{\]}}
+
+; PIC-LINUX: foo2
+; PIC-LINUX: adrp [[R0:x[0-9]+]], :got:__stack_chk_guard
+; PIC-LINUX: ldr [[R1:x[0-9]+]], {{\[}}[[R0]], :got_lo12:__stack_chk_guard{{\]}}
+; PIC-LINUX: ldr {{x[0-9]+}}, {{\[}}[[R1]]{{\]}}
+
+; STATIC-LARGE: foo2
+; STATIC-LARGE: movz [[R0:x[0-9]+]], #:abs_g3:__stack_chk_guard
+; STATIC-LARGE: movk [[R0]], #:abs_g2_nc:__stack_chk_guard
+; STATIC-LARGE: movk [[R0]], #:abs_g1_nc:__stack_chk_guard
+; STATIC-LARGE: movk [[R0]], #:abs_g0_nc:__stack_chk_guard
+; STATIC-LARGE: ldr {{x[0-9]+}}, {{\[}}[[R0]]{{\]}}
+
+; STATIC-SMALL: foo2
+; STATIC-SMALL: adrp [[R0:x[0-9]+]], __stack_chk_guard
+; STATIC-SMALL: ldr {{x[0-9]+}}, {{\[}}[[R0]], :lo12:__stack_chk_guard{{\]}}
+
+define i32 @test_stack_guard_remat() #0 {
+entry:
+  %a1 = alloca [256 x i32], align 4
+  %0 = bitcast [256 x i32]* %a1 to i8*
+  call void @llvm.lifetime.start(i64 1024, i8* %0)
+  %arraydecay = getelementptr inbounds [256 x i32]* %a1, i64 0, i64 0
+  call void @foo3(i32* %arraydecay)
+  call void asm sideeffect "foo2", "~{w0},~{w1},~{w2},~{w3},~{w4},~{w5},~{w6},~{w7},~{w8},~{w9},~{w10},~{w11},~{w12},~{w13},~{w14},~{w15},~{w16},~{w17},~{w18},~{w19},~{w20},~{w21},~{w22},~{w23},~{w24},~{w25},~{w26},~{w27},~{w28},~{w29},~{w30}"()
+  call void @llvm.lifetime.end(i64 1024, i8* %0)
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+declare void @foo3(i32*)
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+attributes #0 = { nounwind sspstrong "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/AArch64/tail-call.ll b/test/CodeGen/AArch64/tail-call.ll
index 8aab842..7fb3954 100644
--- a/test/CodeGen/AArch64/tail-call.ll
+++ b/test/CodeGen/AArch64/tail-call.ll
@@ -3,6 +3,7 @@
 declare fastcc void @callee_stack0()
 declare fastcc void @callee_stack8([8 x i32], i64)
 declare fastcc void @callee_stack16([8 x i32], i64, i64)
+declare extern_weak fastcc void @callee_weak()
 
 define fastcc void @caller_to0_from0() nounwind {
 ; CHECK-LABEL: caller_to0_from0:
@@ -92,3 +93,13 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
 ; CHECK-NEXT: add sp, sp, #16
 ; CHECK-NEXT: b callee_stack16
 }
+
+
+; Weakly-referenced extern functions cannot be tail-called, as AAELF does
+; not define the behaviour of branch instructions to undefined weak symbols.
+define fastcc void @caller_weak() {
+; CHECK-LABEL: caller_weak:
+; CHECK: bl callee_weak
+  tail call void @callee_weak()
+  ret void
+}
diff --git a/test/CodeGen/AArch64/tailcall-fastisel.ll b/test/CodeGen/AArch64/tailcall-fastisel.ll
new file mode 100644
index 0000000..3ba6391
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-fastisel.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin -O0 | FileCheck %s
+
+; CHECK: b _foo0
+
+define i32 @foo1() {
+entry:
+  %call = tail call i32 @foo0()
+  ret i32 %call
+}
+
+declare i32 @foo0()
diff --git a/test/CodeGen/AArch64/tbz-tbnz.ll b/test/CodeGen/AArch64/tbz-tbnz.ll
new file mode 100644
index 0000000..c77043c
--- /dev/null
+++ b/test/CodeGen/AArch64/tbz-tbnz.ll
@@ -0,0 +1,258 @@
+; RUN: llc -O1 -march=aarch64 < %s | FileCheck %s
+
+declare void @t()
+
+define void @test1(i32 %a) {
+; CHECK-LABEL: @test1
+entry:
+  %sub = add nsw i32 %a, -12
+  %cmp = icmp slt i32 %sub, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:w[0-9]+]], w0, #12
+; CHECK: tbz [[CMP]], #31
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test2(i64 %a) {
+; CHECK-LABEL: @test2
+entry:
+  %sub = add nsw i64 %a, -12
+  %cmp = icmp slt i64 %sub, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:x[0-9]+]], x0, #12
+; CHECK: tbz [[CMP]], #63
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test3(i32 %a) {
+; CHECK-LABEL: @test3
+entry:
+  %sub = add nsw i32 %a, -12
+  %cmp = icmp sgt i32 %sub, -1
+  br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:w[0-9]+]], w0, #12
+; CHECK: tbnz [[CMP]], #31
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test4(i64 %a) {
+; CHECK-LABEL: @test4
+entry:
+  %sub = add nsw i64 %a, -12
+  %cmp = icmp sgt i64 %sub, -1
+  br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:x[0-9]+]], x0, #12
+; CHECK: tbnz [[CMP]], #63
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test5(i32 %a) {
+; CHECK-LABEL: @test5
+entry:
+  %sub = add nsw i32 %a, -12
+  %cmp = icmp sge i32 %sub, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:w[0-9]+]], w0, #12
+; CHECK: tbnz [[CMP]], #31
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test6(i64 %a) {
+; CHECK-LABEL: @test6
+entry:
+  %sub = add nsw i64 %a, -12
+  %cmp = icmp sge i64 %sub, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:x[0-9]+]], x0, #12
+; CHECK: tbnz [[CMP]], #63
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test7(i32 %a) {
+; CHECK-LABEL: @test7
+entry:
+  %sub = sub nsw i32 %a, 12
+  %cmp = icmp slt i32 %sub, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:w[0-9]+]], w0, #12
+; CHECK: tbz [[CMP]], #31
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test8(i64 %val1, i64 %val2, i64 %val3) {
+; CHECK-LABEL: @test8
+  %and1 = and i64 %val1, %val2
+  %tst1 = icmp slt i64 %and1, 0
+  br i1 %tst1, label %if.then1, label %if.end
+
+; CHECK: tst x0, x1
+; CHECK-NEXT: b.ge
+
+if.then1:
+  %and2 = and i64 %val2, %val3
+  %tst2 = icmp sge i64 %and2, 0
+  br i1 %tst2, label %if.then2, label %if.end
+
+; CHECK: and [[CMP:x[0-9]+]], x1, x2
+; CHECK-NOT: cmp
+; CHECK: tbnz [[CMP]], #63
+
+if.then2:
+  %shifted_op1 = shl i64 %val2, 63
+  %shifted_and1 = and i64 %val1, %shifted_op1
+  %tst3 = icmp slt i64 %shifted_and1, 0
+  br i1 %tst3, label %if.then3, label %if.end
+
+; CHECK: tst x0, x1, lsl #63
+; CHECK: b.ge
+
+if.then3:
+  %shifted_op2 = shl i64 %val2, 62
+  %shifted_and2 = and i64 %val1, %shifted_op2
+  %tst4 = icmp sge i64 %shifted_and2, 0
+  br i1 %tst4, label %if.then4, label %if.end
+
+; CHECK: tst x0, x1, lsl #62
+; CHECK: b.lt
+
+if.then4:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test9(i64 %val1) {
+; CHECK-LABEL: @test9
+  %tst = icmp slt i64 %val1, 0
+  br i1 %tst, label %if.then, label %if.end
+
+; CHECK-NOT: cmp
+; CHECK: tbz x0, #63
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test10(i64 %val1) {
+; CHECK-LABEL: @test10
+  %tst = icmp slt i64 %val1, 0
+  br i1 %tst, label %if.then, label %if.end
+
+; CHECK-NOT: cmp
+; CHECK: tbz x0, #63
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test11(i64 %val1, i64* %ptr) {
+; CHECK-LABEL: @test11
+
+; CHECK: ldr [[CMP:x[0-9]+]], [x1]
+; CHECK-NOT: cmp
+; CHECK: tbz [[CMP]], #63
+
+  %val = load i64* %ptr
+  %tst = icmp slt i64 %val, 0
+  br i1 %tst, label %if.then, label %if.end
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test12(i64 %val1) {
+; CHECK-LABEL: @test12
+  %tst = icmp slt i64 %val1, 0
+  br i1 %tst, label %if.then, label %if.end
+
+; CHECK-NOT: cmp
+; CHECK: tbz x0, #63
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test13(i64 %val1, i64 %val2) {
+; CHECK-LABEL: @test13
+  %or = or i64 %val1, %val2
+  %tst = icmp slt i64 %or, 0
+  br i1 %tst, label %if.then, label %if.end
+
+; CHECK: orr [[CMP:x[0-9]+]], x0, x1
+; CHECK-NOT: cmp
+; CHECK: tbz [[CMP]], #63
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
diff --git a/test/CodeGen/AArch64/trunc-v1i64.ll b/test/CodeGen/AArch64/trunc-v1i64.ll
index 159b8e0..19efd2f 100644
--- a/test/CodeGen/AArch64/trunc-v1i64.ll
+++ b/test/CodeGen/AArch64/trunc-v1i64.ll
@@ -60,4 +60,23 @@ define <8 x i8> @test_v1i8_1(<1 x i64> %in0) {
   %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %2 = trunc <8 x i64> %1 to <8 x i8>
   ret <8 x i8> %2
-}
-\ No newline at end of file
+}
+
+; PR20777: v1i1 is also problematic, but we can't widen it, so we extract_elt
+; the i64 out of the v1i64 operand, and truncate that scalar instead.
+
+define <1 x i1> @test_v1i1_0(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i1_0:
+; CHECK: fmov w0, s0
+  %1 = trunc <1 x i64> %in0 to <1 x i1>
+  ret <1 x i1> %1
+}
+
+define i1 @test_v1i1_1(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i1_1:
+; CHECK: fmov [[REG:w[0-9]+]], s0
+  %1 = trunc <1 x i64> %in0 to <1 x i1>
+; CHECK: and w0, [[REG]], #0x1
+  %2 = extractelement <1 x i1> %1, i32 0
+  ret i1 %2
+}
diff --git a/test/CodeGen/ARM/2007-05-07-tailmerge-1.ll b/test/CodeGen/ARM/2007-05-07-tailmerge-1.ll
index 55cea3a..90a3b37 100644
--- a/test/CodeGen/ARM/2007-05-07-tailmerge-1.ll
+++ b/test/CodeGen/ARM/2007-05-07-tailmerge-1.ll
@@ -1,11 +1,14 @@
-; RUN: llc < %s -march=arm -enable-tail-merge | grep bl.*baz | count 1
-; RUN: llc < %s -march=arm -enable-tail-merge | grep bl.*quux | count 1
+; RUN: llc < %s -enable-tail-merge | FileCheck %s
 ; Check that calls to baz and quux are tail-merged.
 ; PR1628
 
+; CHECK: bl _baz
+; CHECK-NOT: bl _baz
+; CHECK: bl _quux
+; CHECK-NOT: bl _quux
+
 ; ModuleID = 'tail.c'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
-target triple = "i686-apple-darwin8"
+target triple = "arm-apple-darwin8"
 
 define i32 @f(i32 %i, i32 %q) {
 entry:
diff --git a/test/CodeGen/ARM/2009-10-16-Scope.ll b/test/CodeGen/ARM/2009-10-16-Scope.ll
index 570fcf9..b4e758d 100644
--- a/test/CodeGen/ARM/2009-10-16-Scope.ll
+++ b/test/CodeGen/ARM/2009-10-16-Scope.ll
@@ -9,7 +9,7 @@ entry:
   br label %do.body, !dbg !0
 
 do.body:                                          ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{i32* %count_}, metadata !4)
+  call void @llvm.dbg.declare(metadata !{i32* %count_}, metadata !4, metadata !{metadata !"0x102"})
   %conv = ptrtoint i32* %count_ to i32, !dbg !0   ; <i32> [#uses=1]
   %call = call i32 @foo(i32 %conv) ssp, !dbg !0   ; <i32> [#uses=0]
   br label %do.end, !dbg !0
@@ -18,17 +18,17 @@ do.end:                                           ; preds = %do.body
   ret void, !dbg !7
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i32 @foo(i32) ssp
 
 !0 = metadata !{i32 5, i32 2, metadata !1, null}
-!1 = metadata !{i32 458763, null, metadata !2, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, null, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0}; [DW_TAG_subprogram ]
-!3 = metadata !{i32 458769, metadata !8, i32 12, metadata !"clang 1.1", i1 true, metadata !"", i32 0, null, metadata !9, null, null, null, metadata !""}; [DW_TAG_compile_unit ]
-!4 = metadata !{i32 459008, metadata !5, metadata !"count_", metadata !3, i32 5, metadata !6}; [ DW_TAG_auto_variable ]
-!5 = metadata !{i32 458763, null, metadata !1, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!6 = metadata !{i32 458788, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}; [DW_TAG_base_type ]
+!1 = metadata !{metadata !"0xb\001\001\000", null, metadata !2}; [DW_TAG_lexical_block ]
+!2 = metadata !{metadata !"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", i32 0, metadata !3, null, null, null, null, null, null}; [DW_TAG_subprogram ]
+!3 = metadata !{metadata !"0x11\0012\00clang 1.1\001\00\000\00\000", metadata !8, null, metadata !9, null, null, null}; [DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x100\00count_\005\000", metadata !5, metadata !3, metadata !6}; [ DW_TAG_auto_variable ]
+!5 = metadata !{metadata !"0xb\001\001\000", null, metadata !1}; [DW_TAG_lexical_block ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !3}; [DW_TAG_base_type ]
 !7 = metadata !{i32 6, i32 1, metadata !2, null}
 !8 = metadata !{metadata !"genmodes.i", metadata !"/Users/yash/Downloads"}
 !9 = metadata !{i32 0}
diff --git a/test/CodeGen/ARM/2009-10-21-InvalidFNeg.ll b/test/CodeGen/ARM/2009-10-21-InvalidFNeg.ll
deleted file mode 100644
index 0f021d2..0000000
--- a/test/CodeGen/ARM/2009-10-21-InvalidFNeg.ll
+++ /dev/null
@@ -1,48 +0,0 @@
-; RUN: llc -mcpu=cortex-a8 -mattr=+neon < %s | grep vneg
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
-target triple = "armv7-eabi"
-
-%aaa = type { %fff, %fff }
-%bbb = type { [6 x %ddd] }
-%ccc = type { %eee, %fff }
-%ddd = type { %fff }
-%eee = type { %fff, %fff, %fff, %fff }
-%fff = type { %struct.vec_float4 }
-%struct.vec_float4 = type { <4 x float> }
-
-define linkonce_odr arm_aapcs_vfpcc void @foo(%eee* noalias sret %agg.result, i64 %tfrm.0.0, i64 %tfrm.0.1, i64 %tfrm.0.2, i64 %tfrm.0.3, i64 %tfrm.0.4, i64 %tfrm.0.5, i64 %tfrm.0.6, i64 %tfrm.0.7) nounwind noinline {
-entry:
-  %tmp104 = zext i64 %tfrm.0.2 to i512            ; <i512> [#uses=1]
-  %tmp105 = shl i512 %tmp104, 128                 ; <i512> [#uses=1]
-  %tmp118 = zext i64 %tfrm.0.3 to i512            ; <i512> [#uses=1]
-  %tmp119 = shl i512 %tmp118, 192                 ; <i512> [#uses=1]
-  %ins121 = or i512 %tmp119, %tmp105              ; <i512> [#uses=1]
-  %tmp99 = zext i64 %tfrm.0.4 to i512             ; <i512> [#uses=1]
-  %tmp100 = shl i512 %tmp99, 256                  ; <i512> [#uses=1]
-  %tmp123 = zext i64 %tfrm.0.5 to i512            ; <i512> [#uses=1]
-  %tmp124 = shl i512 %tmp123, 320                 ; <i512> [#uses=1]
-  %tmp96 = zext i64 %tfrm.0.6 to i512             ; <i512> [#uses=1]
-  %tmp97 = shl i512 %tmp96, 384                   ; <i512> [#uses=1]
-  %tmp128 = zext i64 %tfrm.0.7 to i512            ; <i512> [#uses=1]
-  %tmp129 = shl i512 %tmp128, 448                 ; <i512> [#uses=1]
-  %mask.masked = or i512 %tmp124, %tmp100         ; <i512> [#uses=1]
-  %ins131 = or i512 %tmp129, %tmp97               ; <i512> [#uses=1]
-  %tmp109132 = zext i64 %tfrm.0.0 to i128         ; <i128> [#uses=1]
-  %tmp113134 = zext i64 %tfrm.0.1 to i128         ; <i128> [#uses=1]
-  %tmp114133 = shl i128 %tmp113134, 64            ; <i128> [#uses=1]
-  %tmp94 = or i128 %tmp114133, %tmp109132         ; <i128> [#uses=1]
-  %tmp95 = bitcast i128 %tmp94 to <4 x float>     ; <<4 x float>> [#uses=0]
-  %tmp82 = lshr i512 %ins121, 128                 ; <i512> [#uses=1]
-  %tmp83 = trunc i512 %tmp82 to i128              ; <i128> [#uses=1]
-  %tmp84 = bitcast i128 %tmp83 to <4 x float>     ; <<4 x float>> [#uses=0]
-  %tmp86 = lshr i512 %mask.masked, 256            ; <i512> [#uses=1]
-  %tmp87 = trunc i512 %tmp86 to i128              ; <i128> [#uses=1]
-  %tmp88 = bitcast i128 %tmp87 to <4 x float>     ; <<4 x float>> [#uses=0]
-  %tmp90 = lshr i512 %ins131, 384                 ; <i512> [#uses=1]
-  %tmp91 = trunc i512 %tmp90 to i128              ; <i128> [#uses=1]
-  %tmp92 = bitcast i128 %tmp91 to <4 x float>     ; <<4 x float>> [#uses=1]
-  %tmp = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %tmp92 ; <<4 x float>> [#uses=1]
-  %tmp28 = getelementptr inbounds %eee* %agg.result, i32 0, i32 3, i32 0, i32 0 ; <<4 x float>*> [#uses=1]
-  store <4 x float> %tmp, <4 x float>* %tmp28, align 16
-  ret void
-}
diff --git a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
index 35739d7..bce3120 100644
--- a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
+++ b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
@@ -5,28 +5,28 @@ target triple = "armv4t-apple-darwin10"
 
 define hidden i32 @__addvsi3(i32 %a, i32 %b) nounwind {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %b}, i64 0, metadata !0)
+  tail call void @llvm.dbg.value(metadata !{i32 %b}, i64 0, metadata !0, metadata !{metadata !"0x102"})
   %0 = add nsw i32 %b, %a, !dbg !9                ; <i32> [#uses=1]
   ret i32 %0, !dbg !11
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!15}
-!0 = metadata !{i32 524545, metadata !1, metadata !"b", metadata !2, i32 93, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 524334, metadata !12, null, metadata !"__addvsi3", metadata !"__addvsi3", metadata !"__addvsi3", i32 94, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 524329, metadata !12} ; [ DW_TAG_file_type ]
+!0 = metadata !{metadata !"0x101\00b\0093\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00__addvsi3\00__addvsi3\00__addvsi3\0094\000\001\000\006\000\000\000", metadata !12, null, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
 !12 = metadata !{metadata !"libgcc2.c", metadata !"/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc"}
-!3 = metadata !{i32 524305, metadata !12, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)", i1 true, metadata !"", i32 0, metadata !13, metadata !13, metadata !14, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 524309, metadata !12, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)\001\00\000\00\000", metadata !12, metadata !13, metadata !13, metadata !14, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !6, metadata !6}
-!6 = metadata !{i32 524310, metadata !12, null, metadata !"SItype", i32 152, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ]
-!7 = metadata !{i32 524329, metadata !"libgcc2.h", metadata !"/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc", metadata !3} ; [ DW_TAG_file_type ]
-!8 = metadata !{i32 524324, metadata !12, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x16\00SItype\00152\000\000\000\000", metadata !12, null, metadata !8} ; [ DW_TAG_typedef ]
+!7 = metadata !{metadata !"0x29", metadata !"libgcc2.h", metadata !"/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc", metadata !3} ; [ DW_TAG_file_type ]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !12, metadata !2} ; [ DW_TAG_base_type ]
 !9 = metadata !{i32 95, i32 0, metadata !10, null}
-!10 = metadata !{i32 524299, metadata !12, metadata !1, i32 94, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!10 = metadata !{metadata !"0xb\0094\000\000", metadata !12, metadata !1} ; [ DW_TAG_lexical_block ]
 !11 = metadata !{i32 100, i32 0, metadata !10, null}
 !13 = metadata !{i32 0}
 !14 = metadata !{metadata !1}
-!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll b/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll
index a53200e..efe1ab5 100644
--- a/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll
+++ b/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll
@@ -7,16 +7,16 @@ target triple = "thumbv7-apple-darwin3.0.0-iphoneos"
 
 define void @x0(i8* nocapture %buf, i32 %nbytes) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8* %buf}, i64 0, metadata !0), !dbg !15
-  tail call void @llvm.dbg.value(metadata !{i32 %nbytes}, i64 0, metadata !8), !dbg !16
+  tail call void @llvm.dbg.value(metadata !{i8* %buf}, i64 0, metadata !0, metadata !{metadata !"0x102"}), !dbg !15
+  tail call void @llvm.dbg.value(metadata !{i32 %nbytes}, i64 0, metadata !8, metadata !{metadata !"0x102"}), !dbg !16
   %tmp = load i32* @length, !dbg !17              ; <i32> [#uses=3]
   %cmp = icmp eq i32 %tmp, -1, !dbg !17           ; <i1> [#uses=1]
   %cmp.not = xor i1 %cmp, true                    ; <i1> [#uses=1]
   %cmp3 = icmp ult i32 %tmp, %nbytes, !dbg !17    ; <i1> [#uses=1]
   %or.cond = and i1 %cmp.not, %cmp3               ; <i1> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{i32 %tmp}, i64 0, metadata !8), !dbg !17
+  tail call void @llvm.dbg.value(metadata !{i32 %tmp}, i64 0, metadata !8, metadata !{metadata !"0x102"}), !dbg !17
   %nbytes.addr.0 = select i1 %or.cond, i32 %tmp, i32 %nbytes ; <i32> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !18, i64 0, metadata !10), !dbg !19
+  tail call void @llvm.dbg.value(metadata !18, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !19
   br label %while.cond, !dbg !20
 
 while.cond:                                       ; preds = %while.body, %entry
@@ -42,26 +42,26 @@ while.end:                                        ; preds = %land.rhs, %while.co
 
 declare i32 @x1() optsize
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.lv.fn = !{!0, !8, !10, !12}
 !llvm.dbg.gv = !{!14}
 
-!0 = metadata !{i32 524545, metadata !1, metadata !"buf", metadata !2, i32 4, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 524334, metadata !26, null, metadata !"x0", metadata !"x0", metadata !"x0", i32 5, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 524329, metadata !26} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 524305, i32 0, i32 12, metadata !"t.c", metadata !".", metadata !"clang 2.0", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 524309, metadata !26, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!0 = metadata !{metadata !"0x101\00buf\004\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00x0\00x0\00x0\005\000\001\000\006\000\000\000", metadata !26, null, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !26} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\0012\00clang 2.0\001\00\00\00\00", metadata !26, null, null, null, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !26, metadata !2, null, metadata !5, null} ; [ DW_TAG_subroutine_type ]
 !5 = metadata !{null}
-!6 = metadata !{i32 524303, metadata !26, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 524324, metadata !26, metadata !2, metadata !"unsigned char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 524545, metadata !1, metadata !"nbytes", metadata !2, i32 4, metadata !9} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{i32 524324, metadata !26, metadata !2, metadata !"unsigned long", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 524544, metadata !11, metadata !"nread", metadata !2, i32 6, metadata !9} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{i32 524299, metadata !26, metadata !1, i32 5, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 524544, metadata !11, metadata !"c", metadata !2, i32 7, metadata !13} ; [ DW_TAG_auto_variable ]
-!13 = metadata !{i32 524324, metadata !26, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 524340, i32 0, metadata !2, metadata !"length", metadata !"length", metadata !"length", metadata !2, i32 1, metadata !13, i1 false, i1 true, i32* @length} ; [ DW_TAG_variable ]
+!6 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !26, metadata !2, metadata !7} ; [ DW_TAG_pointer_type ]
+!7 = metadata !{metadata !"0x24\00unsigned char\000\008\008\000\000\008", metadata !26, metadata !2} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0x101\00nbytes\004\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0x24\00unsigned long\000\0032\0032\000\000\007", metadata !26, metadata !2} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0x100\00nread\006\000", metadata !11, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!11 = metadata !{metadata !"0xb\005\001\000", metadata !26, metadata !1} ; [ DW_TAG_lexical_block ]
+!12 = metadata !{metadata !"0x100\00c\007\000", metadata !11, metadata !2, metadata !13} ; [ DW_TAG_auto_variable ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !26, metadata !2} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0x34\00length\00length\00length\001\000\001", metadata !2, metadata !2, metadata !13, i32* @length} ; [ DW_TAG_variable ]
 !15 = metadata !{i32 4, i32 24, metadata !1, null}
 !16 = metadata !{i32 4, i32 43, metadata !1, null}
 !17 = metadata !{i32 9, i32 2, metadata !11, null}
@@ -69,7 +69,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !19 = metadata !{i32 10, i32 2, metadata !11, null}
 !20 = metadata !{i32 11, i32 2, metadata !11, null}
 !21 = metadata !{i32 12, i32 3, metadata !22, null}
-!22 = metadata !{i32 524299, metadata !26, metadata !11, i32 11, i32 45, i32 0} ; [ DW_TAG_lexical_block ]
+!22 = metadata !{metadata !"0xb\0011\0045\000", metadata !26, metadata !11} ; [ DW_TAG_lexical_block ]
 !23 = metadata !{i32 13, i32 3, metadata !22, null}
 !24 = metadata !{i32 14, i32 2, metadata !22, null}
 !25 = metadata !{i32 15, i32 1, metadata !11, null}
diff --git a/test/CodeGen/ARM/2010-08-04-StackVariable.ll b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
index 48de244..f10408c 100644
--- a/test/CodeGen/ARM/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
@@ -6,8 +6,8 @@
 define i32 @_Z3fooi4SVal(i32 %i, %struct.SVal* noalias %location) nounwind ssp {
 entry:
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !23), !dbg !24
-  call void @llvm.dbg.value(metadata !{%struct.SVal* %location}, i64 0, metadata !25), !dbg !24
+  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
+  call void @llvm.dbg.value(metadata !{%struct.SVal* %location}, i64 0, metadata !25, metadata !{metadata !"0x102"}), !dbg !24
   %0 = icmp ne i32 %i, 0, !dbg !27                ; <i1> [#uses=1]
   br i1 %0, label %bb, label %bb1, !dbg !27
 
@@ -34,7 +34,7 @@ return:                                           ; preds = %bb2
 define linkonce_odr void @_ZN4SValC1Ev(%struct.SVal* %this) nounwind ssp align 2  {
 entry:
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{%struct.SVal* %this}, i64 0, metadata !31), !dbg !34
+  call void @llvm.dbg.value(metadata !{%struct.SVal* %this}, i64 0, metadata !31, metadata !{metadata !"0x102"}), !dbg !34
   %0 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 0, !dbg !34 ; <i8**> [#uses=1]
   store i8* null, i8** %0, align 8, !dbg !34
   %1 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 1, !dbg !34 ; <i32*> [#uses=1]
@@ -45,14 +45,14 @@ return:                                           ; preds = %entry
   ret void, !dbg !35
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define i32 @main() nounwind ssp {
 entry:
   %0 = alloca %struct.SVal                        ; <%struct.SVal*> [#uses=3]
   %v = alloca %struct.SVal                        ; <%struct.SVal*> [#uses=4]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.SVal* %v}, metadata !38), !dbg !41
+  call void @llvm.dbg.declare(metadata !{%struct.SVal* %v}, metadata !38, metadata !{metadata !"0x102"}), !dbg !41
   call void @_ZN4SValC1Ev(%struct.SVal* %v) nounwind, !dbg !41
   %1 = getelementptr inbounds %struct.SVal* %v, i32 0, i32 1, !dbg !42 ; <i32*> [#uses=1]
   store i32 1, i32* %1, align 8, !dbg !42
@@ -65,65 +65,65 @@ entry:
   %7 = load i32* %6, align 8, !dbg !43            ; <i32> [#uses=1]
   store i32 %7, i32* %5, align 8, !dbg !43
   %8 = call i32 @_Z3fooi4SVal(i32 2, %struct.SVal* noalias %0) nounwind, !dbg !43 ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{i32 %8}, i64 0, metadata !44), !dbg !43
+  call void @llvm.dbg.value(metadata !{i32 %8}, i64 0, metadata !44, metadata !{metadata !"0x102"}), !dbg !43
   br label %return, !dbg !45
 
 return:                                           ; preds = %entry
   ret i32 0, !dbg !45
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!49}
 
-!0 = metadata !{i32 786478, metadata !48, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"", i32 11, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786451, metadata !48, null, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
-!2 = metadata !{i32 786473, metadata !48} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !48, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !47, metadata !47, metadata !46, metadata !47,  metadata !47, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x2e\00SVal\00SVal\00\0011\000\000\000\006\000\000\000", metadata !48, metadata !1, metadata !14, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x13\00SVal\001\00128\0064\000\000\000", metadata !48, null, null, metadata !4, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
+!2 = metadata !{metadata !"0x29", metadata !48} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\001", metadata !48, metadata !47, metadata !47, metadata !46, metadata !47,  metadata !47} ; [ DW_TAG_compile_unit ]
 !4 = metadata !{metadata !5, metadata !7, metadata !0, metadata !9}
-!5 = metadata !{i32 786445, metadata !48, metadata !1, metadata !"Data", i32 7, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
-!6 = metadata !{i32 786447, metadata !48, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 786445, metadata !48, metadata !1, metadata !"Kind", i32 8, i64 32, i64 32, i64 64, i32 0, metadata !8} ; [ DW_TAG_member ]
-!8 = metadata !{i32 786468, metadata !48, null, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 786478, metadata !48, metadata !1, metadata !"~SVal", metadata !"~SVal", metadata !"", i32 12, metadata !10, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0xd\00Data\007\0064\0064\000\000", metadata !48, metadata !1, metadata !6} ; [ DW_TAG_member ]
+!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !48, null, null} ; [ DW_TAG_pointer_type ]
+!7 = metadata !{metadata !"0xd\00Kind\008\0032\0032\0064\000", metadata !48, metadata !1, metadata !8} ; [ DW_TAG_member ]
+!8 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", metadata !48, null} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x2e\00~SVal\00~SVal\00\0012\000\000\000\006\000\000\000", metadata !48, metadata !1, metadata !10, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !48, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{null, metadata !12, metadata !13}
-!12 = metadata !{i32 786447, metadata !48, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !1} ; [ DW_TAG_pointer_type ]
-!13 = metadata !{i32 786468, metadata !48, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !48, null, metadata !1} ; [ DW_TAG_pointer_type ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !48, null} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !48, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{null, metadata !12}
-!16 = metadata !{i32 786478, metadata !48, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"_ZN4SValC1Ev", i32 11, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!17 = metadata !{i32 786478, metadata !48, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3fooi4SVal", i32 16, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!18 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !"0x2e\00SVal\00SVal\00_ZN4SValC1Ev\0011\000\001\000\006\000\000\000", metadata !48, metadata !1, metadata !14, null, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null} ; [ DW_TAG_subprogram ]
+!17 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooi4SVal\0016\000\001\000\006\000\000\000", metadata !48, metadata !2, metadata !18, null, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null} ; [ DW_TAG_subprogram ]
+!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !48, null, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !19 = metadata !{metadata !13, metadata !13, metadata !1}
-!20 = metadata !{i32 786478, metadata !48, metadata !2, metadata !"main", metadata !"main", metadata !"main", i32 23, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @main, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!21 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = metadata !{metadata !"0x2e\00main\00main\00main\0023\000\001\000\006\000\000\000", metadata !48, metadata !2, metadata !21, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
+!21 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !48, null, null, metadata !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !22 = metadata !{metadata !13}
-!23 = metadata !{i32 786689, metadata !17, metadata !"i", metadata !2, i32 16, metadata !13, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!23 = metadata !{metadata !"0x101\00i\0016\000", metadata !17, metadata !2, metadata !13} ; [ DW_TAG_arg_variable ]
 !24 = metadata !{i32 16, i32 0, metadata !17, null}
-!25 = metadata !{i32 786689, metadata !17, metadata !"location", metadata !2, i32 16, metadata !26, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!26 = metadata !{i32 786448, metadata !48, metadata !2, metadata !"SVal", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !1} ; [ DW_TAG_reference_type ]
+!25 = metadata !{metadata !"0x101\00location\0016\000", metadata !17, metadata !2, metadata !26} ; [ DW_TAG_arg_variable ]
+!26 = metadata !{metadata !"0x10\00SVal\000\0064\0064\000\000", metadata !48, metadata !2, metadata !1} ; [ DW_TAG_reference_type ]
 !27 = metadata !{i32 17, i32 0, metadata !28, null}
-!28 = metadata !{i32 786443, metadata !2, metadata !17, i32 16, i32 0, i32 2} ; [ DW_TAG_lexical_block ]
+!28 = metadata !{metadata !"0xb\0016\000\002", metadata !2, metadata !17} ; [ DW_TAG_lexical_block ]
 !29 = metadata !{i32 18, i32 0, metadata !28, null}
 !30 = metadata !{i32 20, i32 0, metadata !28, null}
-!31 = metadata !{i32 786689, metadata !16, metadata !"this", metadata !2, i32 11, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!32 = metadata !{i32 786470, metadata !48, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !33} ; [ DW_TAG_const_type ]
-!33 = metadata !{i32 786447, metadata !48, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !1} ; [ DW_TAG_pointer_type ]
+!31 = metadata !{metadata !"0x101\00this\0011\000", metadata !16, metadata !2, metadata !32} ; [ DW_TAG_arg_variable ]
+!32 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !48, metadata !2, metadata !33} ; [ DW_TAG_const_type ]
+!33 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !48, metadata !2, metadata !1} ; [ DW_TAG_pointer_type ]
 !34 = metadata !{i32 11, i32 0, metadata !16, null}
 !35 = metadata !{i32 11, i32 0, metadata !36, null}
-!36 = metadata !{i32 786443, metadata !48, metadata !37, i32 11, i32 0, i32 1} ; [ DW_TAG_lexical_block ]
-!37 = metadata !{i32 786443, metadata !48, metadata !16, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!38 = metadata !{i32 786688, metadata !39, metadata !"v", metadata !2, i32 24, metadata !1, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
-!39 = metadata !{i32 786443, metadata !48, metadata !40, i32 23, i32 0, i32 4} ; [ DW_TAG_lexical_block ]
-!40 = metadata !{i32 786443, metadata !48, metadata !20, i32 23, i32 0, i32 3} ; [ DW_TAG_lexical_block ]
+!36 = metadata !{metadata !"0xb\0011\000\001", metadata !48, metadata !37} ; [ DW_TAG_lexical_block ]
+!37 = metadata !{metadata !"0xb\0011\000\000", metadata !48, metadata !16} ; [ DW_TAG_lexical_block ]
+!38 = metadata !{metadata !"0x100\00v\0024\000", metadata !39, metadata !2, metadata !1} ; [ DW_TAG_auto_variable ]
+!39 = metadata !{metadata !"0xb\0023\000\004", metadata !48, metadata !40} ; [ DW_TAG_lexical_block ]
+!40 = metadata !{metadata !"0xb\0023\000\003", metadata !48, metadata !20} ; [ DW_TAG_lexical_block ]
 !41 = metadata !{i32 24, i32 0, metadata !39, null}
 !42 = metadata !{i32 25, i32 0, metadata !39, null}
 !43 = metadata !{i32 26, i32 0, metadata !39, null}
-!44 = metadata !{i32 786688, metadata !39, metadata !"k", metadata !2, i32 26, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
+!44 = metadata !{metadata !"0x100\00k\0026\000", metadata !39, metadata !2, metadata !13} ; [ DW_TAG_auto_variable ]
 !45 = metadata !{i32 27, i32 0, metadata !39, null}
 !46 = metadata !{metadata !16, metadata !17, metadata !20}
 !47 = metadata !{}
 !48 = metadata !{metadata !"small.cc", metadata !"/Users/manav/R8248330"}
-!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/2010-11-15-SpillEarlyClobber.ll b/test/CodeGen/ARM/2010-11-15-SpillEarlyClobber.ll
index ec74880..80a1964 100644
--- a/test/CodeGen/ARM/2010-11-15-SpillEarlyClobber.ll
+++ b/test/CodeGen/ARM/2010-11-15-SpillEarlyClobber.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -spiller=trivial
-; RUN: llc < %s -verify-machineinstrs -spiller=inline
+; RUN: llc < %s -verify-machineinstrs
 ; PR8612
 ;
 ; This test has an inline asm with early-clobber arguments.
diff --git a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
index b1d59aa..7fbd3ba 100644
--- a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
+++ b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32"
 target triple = "thumbv7-apple-darwin10"
@@ -11,66 +11,66 @@ target triple = "thumbv7-apple-darwin10"
 
 ; Check debug info output for merged global.
 ; DW_AT_location
-; DW_OP_addr
-; DW_OP_plus
-; .long __MergedGlobals
-; DW_OP_constu
-; offset
+; 0x03 DW_OP_addr
+; 0x.. .long __MergedGlobals
+; 0x10 DW_OP_constu
+; 0x.. offset
+; 0x22 DW_OP_plus
 
-;CHECK: .long Lset7
-;CHECK-NEXT:        @ DW_AT_type
-;CHECK-NEXT:        @ DW_AT_decl_file
-;CHECK-NEXT:        @ DW_AT_decl_line
-;CHECK-NEXT:        @ DW_AT_location
-;CHECK-NEXT:        .byte   3
-;CHECK-NEXT:        .long   __MergedGlobals
-;CHECK-NEXT:        .byte   16
-;CHECK-NEXT:        .byte   1
-;CHECK-NEXT:        .byte   34
+; CHECK: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK:    DW_AT_name {{.*}} "x1"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK:    DW_AT_location [DW_FORM_exprloc]        (<0x8> 03 [[ADDR:.. .. .. ..]] 10 00 22  )
+; CHECK: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK:    DW_AT_name {{.*}} "x2"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK:    DW_AT_location [DW_FORM_exprloc]        (<0x8> 03 [[ADDR]] 10 01 22  )
 
 define zeroext i8 @get1(i8 zeroext %a) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !10), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !30
   %0 = load i8* @x1, align 4, !dbg !30
-  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !11), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !30
   store i8 %a, i8* @x1, align 4, !dbg !30
   ret i8 %0, !dbg !31
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 define zeroext i8 @get2(i8 zeroext %a) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !18), !dbg !32
+  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !32
   %0 = load i8* @x2, align 4, !dbg !32
-  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !19), !dbg !32
+  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !19, metadata !{metadata !"0x102"}), !dbg !32
   store i8 %a, i8* @x2, align 4, !dbg !32
   ret i8 %0, !dbg !33
 }
 
 define zeroext i8 @get3(i8 zeroext %a) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !21), !dbg !34
+  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !21, metadata !{metadata !"0x102"}), !dbg !34
   %0 = load i8* @x3, align 4, !dbg !34
-  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !22), !dbg !34
+  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !22, metadata !{metadata !"0x102"}), !dbg !34
   store i8 %a, i8* @x3, align 4, !dbg !34
   ret i8 %0, !dbg !35
 }
 
 define zeroext i8 @get4(i8 zeroext %a) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !24), !dbg !36
+  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !24, metadata !{metadata !"0x102"}), !dbg !36
   %0 = load i8* @x4, align 4, !dbg !36
-  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !25), !dbg !36
+  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !25, metadata !{metadata !"0x102"}), !dbg !36
   store i8 %a, i8* @x4, align 4, !dbg !36
   ret i8 %0, !dbg !37
 }
 
 define zeroext i8 @get5(i8 zeroext %a) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !27), !dbg !38
+  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !38
   %0 = load i8* @x5, align 4, !dbg !38
-  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !28), !dbg !38
+  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !28, metadata !{metadata !"0x102"}), !dbg !38
   store i8 %a, i8* @x5, align 4, !dbg !38
   ret i8 %0, !dbg !39
 }
@@ -78,36 +78,36 @@ entry:
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!49}
 
-!0 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"get1", metadata !"get1", metadata !"get1", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8 (i8)* @get1, null, null, metadata !42, i32 4} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !47, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2369.8)", i1 true, metadata !"", i32 0, metadata !48, metadata !48, metadata !40, metadata !41,  metadata !48, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !47, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00get1\00get1\00get1\004\000\001\000\006\00256\001\004", metadata !47, metadata !1, metadata !3, null, i8 (i8)* @get1, null, null, metadata !42} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !47} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 2369.8)\001\00\000\00\000", metadata !47, metadata !48, metadata !48, metadata !40, metadata !41,  metadata !48} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5, metadata !5}
-!5 = metadata !{i32 786468, metadata !47, metadata !1, metadata !"_Bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"get2", metadata !"get2", metadata !"get2", i32 7, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8 (i8)* @get2, null, null, metadata !43, i32 7} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"get3", metadata !"get3", metadata !"get3", i32 10, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8 (i8)* @get3, null, null, metadata !44, i32 10} ; [ DW_TAG_subprogram ]
-!8 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"get4", metadata !"get4", metadata !"get4", i32 13, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8 (i8)* @get4, null, null, metadata !45, i32 13} ; [ DW_TAG_subprogram ]
-!9 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"get5", metadata !"get5", metadata !"get5", i32 16, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8 (i8)* @get5, null, null, metadata !46, i32 16} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 4, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{i32 786688, metadata !12, metadata !"b", metadata !1, i32 4, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!12 = metadata !{i32 786443, metadata !47, metadata !0, i32 4, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!13 = metadata !{i32 786484, i32 0, metadata !1, metadata !"x1", metadata !"x1", metadata !"", metadata !1, i32 3, metadata !5, i1 true, i1 true, i8* @x1, null} ; [ DW_TAG_variable ]
-!14 = metadata !{i32 786484, i32 0, metadata !1, metadata !"x2", metadata !"x2", metadata !"", metadata !1, i32 6, metadata !5, i1 true, i1 true, i8* @x2, null} ; [ DW_TAG_variable ]
-!15 = metadata !{i32 786484, i32 0, metadata !1, metadata !"x3", metadata !"x3", metadata !"", metadata !1, i32 9, metadata !5, i1 true, i1 true, i8* @x3, null} ; [ DW_TAG_variable ]
-!16 = metadata !{i32 786484, i32 0, metadata !1, metadata !"x4", metadata !"x4", metadata !"", metadata !1, i32 12, metadata !5, i1 true, i1 true, i8* @x4, null} ; [ DW_TAG_variable ]
-!17 = metadata !{i32 786484, i32 0, metadata !1, metadata !"x5", metadata !"x5", metadata !"", metadata !1, i32 15, metadata !5, i1 false, i1 true, i8* @x5, null} ; [ DW_TAG_variable ]
-!18 = metadata !{i32 786689, metadata !6, metadata !"a", metadata !1, i32 7, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786688, metadata !20, metadata !"b", metadata !1, i32 7, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!20 = metadata !{i32 786443, metadata !47, metadata !6, i32 7, i32 0, i32 1} ; [ DW_TAG_lexical_block ]
-!21 = metadata !{i32 786689, metadata !7, metadata !"a", metadata !1, i32 10, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!22 = metadata !{i32 786688, metadata !23, metadata !"b", metadata !1, i32 10, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!23 = metadata !{i32 786443, metadata !47, metadata !7, i32 10, i32 0, i32 2} ; [ DW_TAG_lexical_block ]
-!24 = metadata !{i32 786689, metadata !8, metadata !"a", metadata !1, i32 13, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!25 = metadata !{i32 786688, metadata !26, metadata !"b", metadata !1, i32 13, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!26 = metadata !{i32 786443, metadata !47, metadata !8, i32 13, i32 0, i32 3} ; [ DW_TAG_lexical_block ]
-!27 = metadata !{i32 786689, metadata !9, metadata !"a", metadata !1, i32 16, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!28 = metadata !{i32 786688, metadata !29, metadata !"b", metadata !1, i32 16, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{i32 786443, metadata !47, metadata !9, i32 16, i32 0, i32 4} ; [ DW_TAG_lexical_block ]
+!5 = metadata !{metadata !"0x24\00_Bool\000\008\008\000\000\002", metadata !47, metadata !1} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00get2\00get2\00get2\007\000\001\000\006\00256\001\007", metadata !47, metadata !1, metadata !3, null, i8 (i8)* @get2, null, null, metadata !43} ; [ DW_TAG_subprogram ]
+!7 = metadata !{metadata !"0x2e\00get3\00get3\00get3\0010\000\001\000\006\00256\001\0010", metadata !47, metadata !1, metadata !3, null, i8 (i8)* @get3, null, null, metadata !44} ; [ DW_TAG_subprogram ]
+!8 = metadata !{metadata !"0x2e\00get4\00get4\00get4\0013\000\001\000\006\00256\001\0013", metadata !47, metadata !1, metadata !3, null, i8 (i8)* @get4, null, null, metadata !45} ; [ DW_TAG_subprogram ]
+!9 = metadata !{metadata !"0x2e\00get5\00get5\00get5\0016\000\001\000\006\00256\001\0016", metadata !47, metadata !1, metadata !3, null, i8 (i8)* @get5, null, null, metadata !46} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x101\00a\004\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!11 = metadata !{metadata !"0x100\00b\004\000", metadata !12, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!12 = metadata !{metadata !"0xb\004\000\000", metadata !47, metadata !0} ; [ DW_TAG_lexical_block ]
+!13 = metadata !{metadata !"0x34\00x1\00x1\00\003\001\001", metadata !1, metadata !1, metadata !5, i8* @x1, null} ; [ DW_TAG_variable ]
+!14 = metadata !{metadata !"0x34\00x2\00x2\00\006\001\001", metadata !1, metadata !1, metadata !5, i8* @x2, null} ; [ DW_TAG_variable ]
+!15 = metadata !{metadata !"0x34\00x3\00x3\00\009\001\001", metadata !1, metadata !1, metadata !5, i8* @x3, null} ; [ DW_TAG_variable ]
+!16 = metadata !{metadata !"0x34\00x4\00x4\00\0012\001\001", metadata !1, metadata !1, metadata !5, i8* @x4, null} ; [ DW_TAG_variable ]
+!17 = metadata !{metadata !"0x34\00x5\00x5\00\0015\000\001", metadata !1, metadata !1, metadata !5, i8* @x5, null} ; [ DW_TAG_variable ]
+!18 = metadata !{metadata !"0x101\00a\007\000", metadata !6, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x100\00b\007\000", metadata !20, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!20 = metadata !{metadata !"0xb\007\000\001", metadata !47, metadata !6} ; [ DW_TAG_lexical_block ]
+!21 = metadata !{metadata !"0x101\00a\0010\000", metadata !7, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!22 = metadata !{metadata !"0x100\00b\0010\000", metadata !23, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!23 = metadata !{metadata !"0xb\0010\000\002", metadata !47, metadata !7} ; [ DW_TAG_lexical_block ]
+!24 = metadata !{metadata !"0x101\00a\0013\000", metadata !8, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!25 = metadata !{metadata !"0x100\00b\0013\000", metadata !26, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!26 = metadata !{metadata !"0xb\0013\000\003", metadata !47, metadata !8} ; [ DW_TAG_lexical_block ]
+!27 = metadata !{metadata !"0x101\00a\0016\000", metadata !9, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!28 = metadata !{metadata !"0x100\00b\0016\000", metadata !29, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!29 = metadata !{metadata !"0xb\0016\000\004", metadata !47, metadata !9} ; [ DW_TAG_lexical_block ]
 !30 = metadata !{i32 4, i32 0, metadata !0, null}
 !31 = metadata !{i32 4, i32 0, metadata !12, null}
 !32 = metadata !{i32 7, i32 0, metadata !6, null}
@@ -127,4 +127,4 @@ entry:
 !46 = metadata !{metadata !27, metadata !28}
 !47 = metadata !{metadata !"foo.c", metadata !"/tmp/"}
 !48 = metadata !{}
-!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/2011-04-12-AlignBug.ll b/test/CodeGen/ARM/2011-04-12-AlignBug.ll
index 97297f7..1a6879e 100644
--- a/test/CodeGen/ARM/2011-04-12-AlignBug.ll
+++ b/test/CodeGen/ARM/2011-04-12-AlignBug.ll
@@ -1,11 +1,10 @@
 ; RUN: llc < %s | FileCheck %s
-target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
 target triple = "thumbv7-apple-darwin10.0.0"
 
 ; CHECK: align 3
 @.v = private unnamed_addr constant <4 x i32> <i32 1, i32 2, i32 3, i32 4>, align 8
-; CHECK: align 2
-@.strA = private unnamed_addr constant [4 x i8] c"bar\00"
+; CHECK: align 4
+@.strA = private unnamed_addr constant [4 x i64] zeroinitializer
 ; CHECK-NOT: align
 @.strB = private unnamed_addr constant [4 x i8] c"foo\00", align 1
 @.strC = private unnamed_addr constant [4 x i8] c"baz\00", section "__TEXT,__cstring,cstring_literals", align 1
diff --git a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
index ed2840b..ede936c 100644
--- a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
+++ b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
@@ -1,25 +1,23 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
 
 ; Check debug info output for merged global.
 ; DW_AT_location
-; DW_OP_addr
-; DW_OP_plus
-; .long __MergedGlobals
-; DW_OP_constu
-; offset
-
-;CHECK: .long Lset9
-;CHECK-NEXT:        @ DW_AT_type
-;CHECK-NEXT:        @ DW_AT_decl_file
-;CHECK-NEXT:        @ DW_AT_decl_line
-;CHECK-NEXT:        @ DW_AT_location
-;CHECK-NEXT:        .byte   3
-;CHECK-NEXT:        .long   __MergedGlobals
-;CHECK-NEXT:        .byte   16
-; 4 is byte offset of x2 in __MergedGobals
-;CHECK-NEXT:        .byte   4
-;CHECK-NEXT:        .byte   34
+; 0x03 DW_OP_addr
+; 0x.. .long __MergedGlobals
+; 0x10 DW_OP_constu
+; 0x.. offset
+; 0x22 DW_OP_plus
 
+; CHECK: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK:    DW_AT_name {{.*}} "x1"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK:    DW_AT_location [DW_FORM_exprloc]        (<0x8> 03 [[ADDR:.. .. .. ..]] 10 00 22  )
+; CHECK: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK:    DW_AT_name {{.*}} "x2"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK:    DW_AT_location [DW_FORM_exprloc]        (<0x8> 03 [[ADDR]] 10 04 22  )
 
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32"
 target triple = "thumbv7-apple-macosx10.7.0"
@@ -31,80 +29,77 @@ target triple = "thumbv7-apple-macosx10.7.0"
 @x5 = global i32 0, align 4
 
 define i32 @get1(i32 %a) nounwind optsize ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !10), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !30
   %1 = load i32* @x1, align 4, !dbg !31
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !11), !dbg !31
+  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !31
   store i32 %a, i32* @x1, align 4, !dbg !31
   ret i32 %1, !dbg !31
 }
 
 define i32 @get2(i32 %a) nounwind optsize ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !13), !dbg !32
+  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !32
   %1 = load i32* @x2, align 4, !dbg !33
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !14), !dbg !33
+  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !33
   store i32 %a, i32* @x2, align 4, !dbg !33
   ret i32 %1, !dbg !33
 }
 
 define i32 @get3(i32 %a) nounwind optsize ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !16), !dbg !34
+  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !34
   %1 = load i32* @x3, align 4, !dbg !35
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !17), !dbg !35
+  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !35
   store i32 %a, i32* @x3, align 4, !dbg !35
   ret i32 %1, !dbg !35
 }
 
 define i32 @get4(i32 %a) nounwind optsize ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !19), !dbg !36
+  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !19, metadata !{metadata !"0x102"}), !dbg !36
   %1 = load i32* @x4, align 4, !dbg !37
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !20), !dbg !37
+  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !20, metadata !{metadata !"0x102"}), !dbg !37
   store i32 %a, i32* @x4, align 4, !dbg !37
   ret i32 %1, !dbg !37
 }
 
 define i32 @get5(i32 %a) nounwind optsize ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !27), !dbg !38
+  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !38
   %1 = load i32* @x5, align 4, !dbg !39
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !28), !dbg !39
+  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !28, metadata !{metadata !"0x102"}), !dbg !39
   store i32 %a, i32* @x5, align 4, !dbg !39
   ret i32 %1, !dbg !39
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!49}
 
-!0 = metadata !{i32 786449, metadata !47, i32 12, metadata !"clang", i1 true, metadata !"", i32 0, metadata !48, metadata !48, metadata !40, metadata !41,  metadata !48, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get1", metadata !"get1", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get1, null, null, metadata !42, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [get1]
-!2 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\0012\00clang\001\00\000\00\001", metadata !47, metadata !48, metadata !48, metadata !40, metadata !41,  metadata !48} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00get1\00get1\00\005\000\001\000\006\00256\001\005", metadata !47, metadata !2, metadata !3, null, i32 (i32)* @get1, null, null, metadata !42} ; [ DW_TAG_subprogram ] [line 5] [def] [get1]
+!2 = metadata !{metadata !"0x29", metadata !47} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get2", metadata !"get2", metadata !"", i32 8, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get2, null, null, metadata !43, i32 8} ; [ DW_TAG_subprogram ] [line 8] [def] [get2]
-!7 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get3", metadata !"get3", metadata !"", i32 11, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get3, null, null, metadata !44, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [get3]
-!8 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get4", metadata !"get4", metadata !"", i32 14, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get4, null, null, metadata !45, i32 14} ; [ DW_TAG_subprogram ] [line 14] [def] [get4]
-!9 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get5", metadata !"get5", metadata !"", i32 17, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get5, null, null, metadata !46, i32 17} ; [ DW_TAG_subprogram ] [line 17] [def] [get5]
-!10 = metadata !{i32 786689, metadata !1, metadata !"a", metadata !2, i32 16777221, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{i32 786688, metadata !12, metadata !"b", metadata !2, i32 5, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!12 = metadata !{i32 786443, metadata !47, metadata !1, i32 5, i32 19, i32 0} ; [ DW_TAG_lexical_block ]
-!13 = metadata !{i32 786689, metadata !6, metadata !"a", metadata !2, i32 16777224, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!14 = metadata !{i32 786688, metadata !15, metadata !"b", metadata !2, i32 8, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{i32 786443, metadata !47, metadata !6, i32 8, i32 17, i32 1} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 786689, metadata !7, metadata !"a", metadata !2, i32 16777227, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{i32 786688, metadata !18, metadata !"b", metadata !2, i32 11, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!18 = metadata !{i32 786443, metadata !47, metadata !7, i32 11, i32 19, i32 2} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{i32 786689, metadata !8, metadata !"a", metadata !2, i32 16777230, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!20 = metadata !{i32 786688, metadata !21, metadata !"b", metadata !2, i32 14, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!21 = metadata !{i32 786443, metadata !47, metadata !8, i32 14, i32 19, i32 3} ; [ DW_TAG_lexical_block ]
-!22 = metadata !{i32 786484, i32 0, metadata !0, metadata !"x5", metadata !"x5", metadata !"", metadata !2, i32 16, metadata !5, i32 0, i32 1, i32* @x5, null} ; [ DW_TAG_variable ]
-!23 = metadata !{i32 786484, i32 0, metadata !0, metadata !"x4", metadata !"x4", metadata !"", metadata !2, i32 13, metadata !5, i32 1, i32 1, i32* @x4, null} ; [ DW_TAG_variable ]
-!24 = metadata !{i32 786484, i32 0, metadata !0, metadata !"x3", metadata !"x3", metadata !"", metadata !2, i32 10, metadata !5, i32 1, i32 1, i32* @x3, null} ; [ DW_TAG_variable ]
-!25 = metadata !{i32 786484, i32 0, metadata !0, metadata !"x2", metadata !"x2", metadata !"", metadata !2, i32 7, metadata !5, i32 1, i32 1, i32* @x2, null} ; [ DW_TAG_variable ]
-!26 = metadata !{i32 786484, i32 0, metadata !0, metadata !"x1", metadata !"x1", metadata !"", metadata !2, i32 4, metadata !5, i32 1, i32 1, i32* @x1, null} ; [ DW_TAG_variable ]
-!27 = metadata !{i32 786689, metadata !9, metadata !"a", metadata !2, i32 16777233, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!28 = metadata !{i32 786688, metadata !29, metadata !"b", metadata !2, i32 17, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{i32 786443, metadata !47, metadata !9, i32 17, i32 19, i32 4} ; [ DW_TAG_lexical_block ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00get2\00get2\00\008\000\001\000\006\00256\001\008", metadata !47, metadata !2, metadata !3, null, i32 (i32)* @get2, null, null, metadata !43} ; [ DW_TAG_subprogram ] [line 8] [def] [get2]
+!7 = metadata !{metadata !"0x2e\00get3\00get3\00\0011\000\001\000\006\00256\001\0011", metadata !47, metadata !2, metadata !3, null, i32 (i32)* @get3, null, null, metadata !44} ; [ DW_TAG_subprogram ] [line 11] [def] [get3]
+!8 = metadata !{metadata !"0x2e\00get4\00get4\00\0014\000\001\000\006\00256\001\0014", metadata !47, metadata !2, metadata !3, null, i32 (i32)* @get4, null, null, metadata !45} ; [ DW_TAG_subprogram ] [line 14] [def] [get4]
+!9 = metadata !{metadata !"0x2e\00get5\00get5\00\0017\000\001\000\006\00256\001\0017", metadata !47, metadata !2, metadata !3, null, i32 (i32)* @get5, null, null, metadata !46} ; [ DW_TAG_subprogram ] [line 17] [def] [get5]
+!10 = metadata !{metadata !"0x101\00a\0016777221\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
+!11 = metadata !{metadata !"0x100\00b\005\000", metadata !12, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
+!12 = metadata !{metadata !"0xb\005\0019\000", metadata !47, metadata !1} ; [ DW_TAG_lexical_block ]
+!13 = metadata !{metadata !"0x101\00a\0016777224\000", metadata !6, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
+!14 = metadata !{metadata !"0x100\00b\008\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
+!15 = metadata !{metadata !"0xb\008\0017\001", metadata !47, metadata !6} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{metadata !"0x101\00a\0016777227\000", metadata !7, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
+!17 = metadata !{metadata !"0x100\00b\0011\000", metadata !18, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
+!18 = metadata !{metadata !"0xb\0011\0019\002", metadata !47, metadata !7} ; [ DW_TAG_lexical_block ]
+!19 = metadata !{metadata !"0x101\00a\0016777230\000", metadata !8, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
+!20 = metadata !{metadata !"0x100\00b\0014\000", metadata !21, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
+!21 = metadata !{metadata !"0xb\0014\0019\003", metadata !47, metadata !8} ; [ DW_TAG_lexical_block ]
+!25 = metadata !{metadata !"0x34\00x1\00x1\00\004\001\001", metadata !0, metadata !2, metadata !5, i32* @x1, null} ; [ DW_TAG_variable ]
+!26 = metadata !{metadata !"0x34\00x2\00x2\00\007\001\001", metadata !0, metadata !2, metadata !5, i32* @x2, null} ; [ DW_TAG_variable ]
+!27 = metadata !{metadata !"0x101\00a\0016777233\000", metadata !9, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
+!28 = metadata !{metadata !"0x100\00b\0017\000", metadata !29, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
+!29 = metadata !{metadata !"0xb\0017\0019\004", metadata !47, metadata !9} ; [ DW_TAG_lexical_block ]
 !30 = metadata !{i32 5, i32 16, metadata !1, null}
 !31 = metadata !{i32 5, i32 32, metadata !12, null}
 !32 = metadata !{i32 8, i32 14, metadata !6, null}
@@ -116,7 +111,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !38 = metadata !{i32 17, i32 16, metadata !9, null}
 !39 = metadata !{i32 17, i32 32, metadata !29, null}
 !40 = metadata !{metadata !1, metadata !6, metadata !7, metadata !8, metadata !9}
-!41 = metadata !{metadata !22, metadata !23, metadata !24, metadata !25, metadata !26}
+!41 = metadata !{metadata !25, metadata !26}
 !42 = metadata !{metadata !10, metadata !11}
 !43 = metadata !{metadata !13, metadata !14}
 !44 = metadata !{metadata !16, metadata !17}
@@ -124,4 +119,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !46 = metadata !{metadata !27, metadata !28}
 !47 = metadata !{metadata !"ss3.c", metadata !"/private/tmp"}
 !48 = metadata !{}
-!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/2014-07-18-earlyclobber-str-post.ll b/test/CodeGen/ARM/2014-07-18-earlyclobber-str-post.ll
new file mode 100644
index 0000000..df7d245
--- /dev/null
+++ b/test/CodeGen/ARM/2014-07-18-earlyclobber-str-post.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=armv7-linux-gnueabihf %s -o - | FileCheck %s
+
+; Check that we don't create an unpredictable STR instruction,
+; e.g. str r0, [r0], #4
+
+define i32* @earlyclobber-str-post(i32* %addr) nounwind {
+; CHECK-LABEL: earlyclobber-str-post
+; CHECK-NOT: str r[[REG:[0-9]+]], [r[[REG]]], #4
+  %val = ptrtoint i32* %addr to i32
+  store i32 %val, i32* %addr
+  %new = getelementptr i32* %addr, i32 1
+  ret i32* %new
+}
+
+define i16* @earlyclobber-strh-post(i16* %addr) nounwind {
+; CHECK-LABEL: earlyclobber-strh-post
+; CHECK-NOT: strh r[[REG:[0-9]+]], [r[[REG]]], #2
+  %val = ptrtoint i16* %addr to i32
+  %tr = trunc i32 %val to i16
+  store i16 %tr, i16* %addr
+  %new = getelementptr i16* %addr, i32 1
+  ret i16* %new
+}
+
+define i8* @earlyclobber-strb-post(i8* %addr) nounwind {
+; CHECK-LABEL: earlyclobber-strb-post
+; CHECK-NOT: strb r[[REG:[0-9]+]], [r[[REG]]], #1
+  %val = ptrtoint i8* %addr to i32
+  %tr = trunc i32 %val to i8
+  store i8 %tr, i8* %addr
+  %new = getelementptr i8* %addr, i32 1
+  ret i8* %new
+}
diff --git a/test/CodeGen/ARM/2014-08-04-muls-it.ll b/test/CodeGen/ARM/2014-08-04-muls-it.ll
new file mode 100644
index 0000000..4636bff
--- /dev/null
+++ b/test/CodeGen/ARM/2014-08-04-muls-it.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple thumbv7-eabi -arm-restrict-it -filetype asm -o - %s \
+; RUN:    | FileCheck %s
+
+define arm_aapcscc i32 @function(i32 %i, i32 %j) {
+entry:
+  %cmp = icmp eq i32 %i, %j
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %mul = mul nsw i32 %i, %i
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %i.addr.0 = phi i32 [ %mul, %if.then ], [ %i, %entry ]
+  ret i32 %i.addr.0
+}
+
+; CHECK-LABEL: function
+; CHECK: cmp r0, r1
+; CHECK: bne [[LABEL:[.*]]]
+; CHECK-NOT: mulseq r0, r0, r0
+; CHECK: [[LABEL]]
+; CHECK: muls r0, r0, r0
+; CHECK: bx lr
+
diff --git a/test/CodeGen/ARM/Windows/alloca.ll b/test/CodeGen/ARM/Windows/alloca.ll
new file mode 100644
index 0000000..6a3d002
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/alloca.ll
@@ -0,0 +1,22 @@
+; RUN: llc -O0 -mtriple thumbv7-windows-itanium -filetype asm -o - %s | FileCheck %s
+
+declare arm_aapcs_vfpcc i32 @num_entries()
+
+define arm_aapcs_vfpcc void @test___builtin_alloca() {
+entry:
+  %array = alloca i8*, align 4
+  %call = call arm_aapcs_vfpcc i32 @num_entries()
+  %mul = mul i32 4, %call
+  %0 = alloca i8, i32 %mul
+  store i8* %0, i8** %array, align 4
+  ret void
+}
+
+; CHECK: bl num_entries
+; CHECK: movs [[R1:r[0-9]+]], #7
+; CHECK: add.w [[R0:r[0-9]+]], [[R1]], [[R0]], lsl #2
+; CHECK: bic [[R0]], [[R0]], #7
+; CHECK: lsrs r4, [[R0]], #2
+; CHECK: bl __chkstk
+; CHECK: sub.w sp, sp, r4
+
diff --git a/test/CodeGen/ARM/aapcs-hfa-code.ll b/test/CodeGen/ARM/aapcs-hfa-code.ll
index 396e838..5545dfd 100644
--- a/test/CodeGen/ARM/aapcs-hfa-code.ll
+++ b/test/CodeGen/ARM/aapcs-hfa-code.ll
@@ -54,12 +54,11 @@ define arm_aapcs_vfpcc void @test_1double({ double } %a) {
 ; CHECK: bl test_1double
 
 ; CHECK-M4F-LABEL: test_1double:
-; CHECK-M4F: movs [[ONEHI:r[0-9]+]], #0
-; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
-; CHECK-M4F: movt [[ONEHI]], #16368
-; CHECK-M4F-DAG: vmov s0, [[ONELO]]
-; CHECK-M4F-DAG: vmov s1, [[ONEHI]]
+; CHECK-M4F: vldr d0, [[CP_LABEL:.*]]
 ; CHECK-M4F: bl test_1double
+; CHECK-M4F: [[CP_LABEL]]
+; CHECK-M4F-NEXT: .long 0
+; CHECK-M4F-NEXT: .long 1072693248
 
   call arm_aapcs_vfpcc void @test_1double({ double } { double 1.0 })
   ret void
@@ -76,11 +75,10 @@ define arm_aapcs_vfpcc void @test_1double_nosplit([4 x float], [4 x double], [3
 ; CHECK: bl test_1double_nosplit
 
 ; CHECK-M4F-LABEL: test_1double_nosplit:
-; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
 ; CHECK-M4F: movs [[ONEHI:r[0-9]+]], #0
+; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
 ; CHECK-M4F: movt [[ONEHI]], #16368
-; CHECK-M4F-DAG: str [[ONELO]], [sp]
-; CHECK-M4F-DAG: str [[ONEHI]], [sp, #4]
+; CHECK-M4F: strd [[ONELO]], [[ONEHI]], [sp]
 ; CHECK-M4F: bl test_1double_nosplit
   call arm_aapcs_vfpcc void @test_1double_nosplit([4 x float] undef, [4 x double] undef, [3 x float] undef, double 1.0)
   ret void
@@ -92,19 +90,16 @@ define arm_aapcs_vfpcc void @test_1double_misaligned([4 x double], [4 x double],
   call arm_aapcs_vfpcc void @test_1double_misaligned([4 x double] undef, [4 x double] undef, float undef, double 1.0)
 
 ; CHECK-LABEL: test_1double_misaligned:
-; CHECK-DAG: mov [[ONELO:r[0-9]+]], #0
-; CHECK-DAG: mov r[[BASE:[0-9]+]], sp
 ; CHECK-DAG: movw [[ONEHI:r[0-9]+]], #0
+; CHECK-DAG: mov [[ONELO:r[0-9]+]], #0
 ; CHECK-DAG: movt [[ONEHI]], #16368
-; CHECK-DAG: str [[ONELO]], [r[[BASE]], #8]!
-; CHECK-DAG: str [[ONEHI]], [r[[BASE]], #4]
+; CHECK-DAG: strd [[ONELO]], [[ONEHI]], [sp, #8]
 
 ; CHECK-M4F-LABEL: test_1double_misaligned:
-; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
 ; CHECK-M4F: movs [[ONEHI:r[0-9]+]], #0
+; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
 ; CHECK-M4F: movt [[ONEHI]], #16368
-; CHECK-M4F-DAG: str [[ONELO]], [sp, #8]
-; CHECK-M4F-DAG: str [[ONEHI]], [sp, #12]
+; CHECK-M4F: strd [[ONELO]], [[ONEHI]], [sp, #8]
 ; CHECK-M4F: bl test_1double_misaligned
 
   ret void
diff --git a/test/CodeGen/ARM/adv-copy-opt.ll b/test/CodeGen/ARM/adv-copy-opt.ll
new file mode 100644
index 0000000..f71bf78
--- /dev/null
+++ b/test/CodeGen/ARM/adv-copy-opt.ll
@@ -0,0 +1,38 @@
+; RUN: llc -O1 -mtriple=armv7s-apple-ios -mcpu=swift < %s -disable-adv-copy-opt=true | FileCheck -check-prefix=NOOPT --check-prefix=CHECK %s
+; RUN: llc -O1 -mtriple=armv7s-apple-ios -mcpu=swift < %s -disable-adv-copy-opt=false | FileCheck -check-prefix=OPT --check-prefix=CHECK %s
+; RUN: llc -O1 -mtriple=thumbv7s-apple-ios -mcpu=swift < %s -disable-adv-copy-opt=true | FileCheck -check-prefix=NOOPT --check-prefix=CHECK %s
+; RUN: llc -O1 -mtriple=thumbv7s-apple-ios -mcpu=swift < %s -disable-adv-copy-opt=false | FileCheck -check-prefix=OPT --check-prefix=CHECK %s
+
+; CHECK-LABEL: simpleVectorDiv
+; ABI: %A => r0, r1.
+;      %B => r2, r3
+;      ret => r0, r1
+; We want to compute:
+; r0 = r0 / r2
+; r1 = r1 / r3
+;
+; NOOPT: vmov	[[B:d[0-9]+]], r2, r3
+; NOOPT-NEXT: vmov	[[A:d[0-9]+]], r0, r1
+; Move the low part of B into a register.
+; Unfortunately, we cannot express that the 's' register is the low
+; part of B, i.e., sIdx == BIdx x 2. E.g., B = d1, B_low = s2.
+; NOOPT-NEXT: vmov	[[B_LOW:r[0-9]+]], s{{[0-9]+}}
+; NOOPT-NEXT: vmov	[[A_LOW:r[0-9]+]], s{{[0-9]+}}
+; NOOPT-NEXT: udiv	[[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
+; NOOPT-NEXT: vmov	[[B_HIGH:r[0-9]+]], s{{[0-9]+}}
+; NOOPT-NEXT: vmov	[[A_HIGH:r[0-9]+]], s{{[0-9]+}}
+; NOOPT-NEXT: udiv	[[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
+; NOOPT-NEXT: vmov.32	[[RES:d[0-9]+]][0], [[RES_LOW]]
+; NOOPT-NEXT: vmov.32	[[RES]][1], [[RES_HIGH]]
+; NOOPT-NEXT: vmov	r0, r1, [[RES]]
+; NOOPT-NEXT: bx	lr
+;
+; OPT-NOT: vmov
+; OPT: 	udiv	r0, r0, r2
+; OPT-NEXT: udiv	r1, r1, r3
+; OPT-NEXT: bx	lr
+define <2 x i32> @simpleVectorDiv(<2 x i32> %A, <2 x i32> %B) nounwind {
+entry:
+  %div = udiv <2 x i32> %A, %B
+  ret <2 x i32> %div
+}
diff --git a/test/CodeGen/ARM/aliases.ll b/test/CodeGen/ARM/aliases.ll
index f55ae10..5a737ad 100644
--- a/test/CodeGen/ARM/aliases.ll
+++ b/test/CodeGen/ARM/aliases.ll
@@ -25,9 +25,9 @@
 define i32 @foo_f() {
   ret i32 0
 }
-@bar_f = alias weak %FunTy* @foo_f
+@bar_f = weak alias %FunTy* @foo_f
 
-@bar_i = alias internal i32* @bar
+@bar_i = internal alias i32* @bar
 
 @A = alias bitcast (i32* @bar to i64*)
 
diff --git a/test/CodeGen/ARM/arm32-round-conv.ll b/test/CodeGen/ARM/arm32-round-conv.ll
new file mode 100644
index 0000000..88fb891
--- /dev/null
+++ b/test/CodeGen/ARM/arm32-round-conv.ll
@@ -0,0 +1,117 @@
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=+fp-armv8 | FileCheck %s
+; RUN: llc < %s -mtriple=armv8-linux-gnueabihf -mattr=+fp-armv8 | FileCheck %s
+
+; CHECK-LABEL: test1
+; CHECK: vcvtm.s32.f32
+define i32 @test1(float %a) {
+entry:
+  %call = call float @floorf(float %a) nounwind readnone
+  %conv = fptosi float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test2
+; CHECK: vcvtm.u32.f32
+define i32 @test2(float %a) {
+entry:
+  %call = call float @floorf(float %a) nounwind readnone
+  %conv = fptoui float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test3
+; CHECK: vcvtm.s32.f64
+define i32 @test3(double %a) {
+entry:
+  %call = call double @floor(double %a) nounwind readnone
+  %conv = fptosi double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test4
+; CHECK: vcvtm.u32.f64
+define i32 @test4(double %a) {
+entry:
+  %call = call double @floor(double %a) nounwind readnone
+  %conv = fptoui double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test5
+; CHECK: vcvtp.s32.f32
+define i32 @test5(float %a) {
+entry:
+  %call = call float @ceilf(float %a) nounwind readnone
+  %conv = fptosi float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test6
+; CHECK: vcvtp.u32.f32
+define i32 @test6(float %a) {
+entry:
+  %call = call float @ceilf(float %a) nounwind readnone
+  %conv = fptoui float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test7
+; CHECK: vcvtp.s32.f64
+define i32 @test7(double %a) {
+entry:
+  %call = call double @ceil(double %a) nounwind readnone
+  %conv = fptosi double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test8
+; CHECK: vcvtp.u32.f64
+define i32 @test8(double %a) {
+entry:
+  %call = call double @ceil(double %a) nounwind readnone
+  %conv = fptoui double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test9
+; CHECK: vcvta.s32.f32
+define i32 @test9(float %a) {
+entry:
+  %call = call float @roundf(float %a) nounwind readnone
+  %conv = fptosi float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test10
+; CHECK: vcvta.u32.f32
+define i32 @test10(float %a) {
+entry:
+  %call = call float @roundf(float %a) nounwind readnone
+  %conv = fptoui float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test11
+; CHECK: vcvta.s32.f64
+define i32 @test11(double %a) {
+entry:
+  %call = call double @round(double %a) nounwind readnone
+  %conv = fptosi double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test12
+; CHECK: vcvta.u32.f64
+define i32 @test12(double %a) {
+entry:
+  %call = call double @round(double %a) nounwind readnone
+  %conv = fptoui double %call to i32
+  ret i32 %conv
+}
+
+declare float @floorf(float) nounwind readnone
+declare double @floor(double) nounwind readnone
+declare float @ceilf(float) nounwind readnone
+declare double @ceil(double) nounwind readnone
+declare float @roundf(float) nounwind readnone
+declare double @round(double) nounwind readnone
diff --git a/test/CodeGen/ARM/arm32-rounding.ll b/test/CodeGen/ARM/arm32-rounding.ll
new file mode 100644
index 0000000..f247648
--- /dev/null
+++ b/test/CodeGen/ARM/arm32-rounding.ll
@@ -0,0 +1,118 @@
+; RUN: llc < %s -mtriple=armv8-linux-gnueabihf -mattr=+fp-armv8 | FileCheck --check-prefix=CHECK --check-prefix=DP %s
+; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabihf -mattr=+fp-armv8,+d16,+fp-only-sp | FileCheck --check-prefix=SP %s
+; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabihf -mattr=+fp-armv8,+d16 | FileCheck --check-prefix=DP %s
+
+; CHECK-LABEL: test1
+; CHECK: vrintm.f32
+define float @test1(float %a) {
+entry:
+  %call = call float @floorf(float %a) nounwind readnone
+  ret float %call
+}
+
+; CHECK-LABEL: test2
+; SP: b floor
+; DP: vrintm.f64
+define double @test2(double %a) {
+entry:
+  %call = call double @floor(double %a) nounwind readnone
+  ret double %call
+}
+
+; CHECK-LABEL: test3
+; CHECK: vrintp.f32
+define float @test3(float %a) {
+entry:
+  %call = call float @ceilf(float %a) nounwind readnone
+  ret float %call
+}
+
+; CHECK-LABEL: test4
+; SP: b ceil
+; DP: vrintp.f64
+define double @test4(double %a) {
+entry:
+  %call = call double @ceil(double %a) nounwind readnone
+  ret double %call
+}
+
+; CHECK-LABEL: test5
+; CHECK: vrinta.f32
+define float @test5(float %a) {
+entry:
+  %call = call float @roundf(float %a) nounwind readnone
+  ret float %call
+}
+
+; CHECK-LABEL: test6
+; SP: b round
+; DP: vrinta.f64
+define double @test6(double %a) {
+entry:
+  %call = call double @round(double %a) nounwind readnone
+  ret double %call
+}
+
+; CHECK-LABEL: test7
+; CHECK: vrintz.f32
+define float @test7(float %a) {
+entry:
+  %call = call float @truncf(float %a) nounwind readnone
+  ret float %call
+}
+
+; CHECK-LABEL: test8
+; SP: b trunc
+; DP: vrintz.f64
+define double @test8(double %a) {
+entry:
+  %call = call double @trunc(double %a) nounwind readnone
+  ret double %call
+}
+
+; CHECK-LABEL: test9
+; CHECK: vrintr.f32
+define float @test9(float %a) {
+entry:
+  %call = call float @nearbyintf(float %a) nounwind readnone
+  ret float %call
+}
+
+; CHECK-LABEL: test10
+; SP: b nearbyint
+; DP: vrintr.f64
+define double @test10(double %a) {
+entry:
+  %call = call double @nearbyint(double %a) nounwind readnone
+  ret double %call
+}
+
+; CHECK-LABEL: test11
+; CHECK: vrintx.f32
+define float @test11(float %a) {
+entry:
+  %call = call float @rintf(float %a) nounwind readnone
+  ret float %call
+}
+
+; CHECK-LABEL: test12
+; SP: b rint
+; DP: vrintx.f64
+define double @test12(double %a) {
+entry:
+  %call = call double @rint(double %a) nounwind readnone
+  ret double %call
+}
+
+declare float @floorf(float) nounwind readnone
+declare double @floor(double) nounwind readnone
+declare float @ceilf(float) nounwind readnone
+declare double @ceil(double) nounwind readnone
+declare float @roundf(float) nounwind readnone
+declare double @round(double) nounwind readnone
+declare float @truncf(float) nounwind readnone
+declare double @trunc(double) nounwind readnone
+declare float @nearbyintf(float) nounwind readnone
+declare double @nearbyint(double) nounwind readnone
+declare float @rintf(float) nounwind readnone
+declare double @rint(double) nounwind readnone
diff --git a/test/CodeGen/ARM/atomic-cmpxchg.ll b/test/CodeGen/ARM/atomic-cmpxchg.ll
new file mode 100644
index 0000000..84790be
--- /dev/null
+++ b/test/CodeGen/ARM/atomic-cmpxchg.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -mtriple=arm-linux-gnueabi -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-ARM
+; RUN: llc < %s -mtriple=thumb-linux-gnueabi -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-THUMB
+
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-ARMV7
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-THUMBV7
+
+define zeroext i1 @test_cmpxchg_res_i8(i8* %addr, i8 %desired, i8 zeroext %new) {
+entry:
+  %0 = cmpxchg i8* %addr, i8 %desired, i8 %new monotonic monotonic
+  %1 = extractvalue { i8, i1 } %0, 1
+  ret i1 %1
+}
+
+; CHECK-ARM-LABEL: test_cmpxchg_res_i8
+; CHECK-ARM: bl __sync_val_compare_and_swap_1
+; CHECK-ARM: mov [[REG:r[0-9]+]], #0
+; CHECK-ARM: cmp r0, {{r[0-9]+}}
+; CHECK-ARM: moveq [[REG]], #1
+; CHECK-ARM: mov r0, [[REG]]
+
+; CHECK-THUMB-LABEL: test_cmpxchg_res_i8
+; CHECK-THUMB: bl __sync_val_compare_and_swap_1
+; CHECK-THUMB-NOT: mov [[R1:r[0-7]]], r0
+; CHECK-THUMB: push  {r0}
+; CHECK-THUMB: pop {[[R1:r[0-7]]]}
+; CHECK-THUMB: movs r0, #1
+; CHECK-THUMB: movs [[R2:r[0-9]+]], #0
+; CHECK-THUMB: cmp [[R1]], {{r[0-9]+}}
+; CHECK-THU<B: beq
+; CHECK-THUMB: push  {[[R2]]}
+; CHECK-THUMB: pop {r0}
+
+; CHECK-ARMV7-LABEL: test_cmpxchg_res_i8
+; CHECK-ARMV7: ldrexb [[R3:r[0-9]+]], [r0]
+; CHECK-ARMV7: mov [[R1:r[0-9]+]], #0
+; CHECK-ARMV7: cmp [[R3]], {{r[0-9]+}}
+; CHECK-ARMV7: bne
+; CHECK-ARMV7: strexb [[R3]], {{r[0-9]+}}, [{{r[0-9]+}}]
+; CHECK-ARMV7: mov [[R1]], #1
+; CHECK-ARMV7: cmp [[R3]], #0
+; CHECK-ARMV7: bne
+; CHECK-ARMV7: mov r0, [[R1]]
+
+; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8
+; CHECK-THUMBV7: ldrexb [[R3:r[0-9]+]], [r0]
+; CHECK-THUMBV7: cmp [[R3]], {{r[0-9]+}}
+; CHECK-THUMBV7: movne r0, #0
+; CHECK-THUMBV7: bxne lr
+; CHECK-THUMBV7: strexb [[R3]], {{r[0-9]+}}, [{{r[0-9]+}}]
+; CHECK-THUMBV7: cmp [[R3]], #0
+; CHECK-THUMBV7: itt eq
+; CHECK-THUMBV7: moveq r0, #1
+; CHECK-THUMBV7: bxeq lr
diff --git a/test/CodeGen/ARM/atomic-load-store.ll b/test/CodeGen/ARM/atomic-load-store.ll
index 49342d2..af13dfc 100644
--- a/test/CodeGen/ARM/atomic-load-store.ll
+++ b/test/CodeGen/ARM/atomic-load-store.ll
@@ -3,6 +3,8 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s -check-prefix=THUMBTWO
 ; RUN: llc < %s -mtriple=thumbv6-apple-ios | FileCheck %s -check-prefix=THUMBONE
 ; RUN: llc < %s -mtriple=armv4-apple-ios | FileCheck %s -check-prefix=ARMV4
+; RUN: llc < %s -mtriple=armv6-apple-ios | FileCheck %s -check-prefix=ARMV6
+; RUN: llc < %s -mtriple=thumbv7m-apple-ios | FileCheck %s -check-prefix=THUMBM
 
 define void @test1(i32* %ptr, i32 %val1) {
 ; ARM-LABEL: test1
@@ -15,6 +17,14 @@ define void @test1(i32* %ptr, i32 %val1) {
 ; THUMBTWO: dmb {{ish$}}
 ; THUMBTWO-NEXT: str
 ; THUMBTWO-NEXT: dmb {{ish$}}
+; ARMV6-LABEL: test1
+; ARMV6: mcr p15, #0, {{r[0-9]*}}, c7, c10, #5
+; ARMV6: str
+; ARMV6: mcr p15, #0, {{r[0-9]*}}, c7, c10, #5
+; THUMBM-LABEL: test1
+; THUMBM: dmb sy
+; THUMBM: str
+; THUMBM: dmb sy
   store atomic i32 %val1, i32* %ptr seq_cst, align 4
   ret void
 }
@@ -28,6 +38,12 @@ define i32 @test2(i32* %ptr) {
 ; THUMBTWO-LABEL: test2
 ; THUMBTWO: ldr
 ; THUMBTWO-NEXT: dmb {{ish$}}
+; ARMV6-LABEL: test2
+; ARMV6: ldr
+; ARMV6: mcr p15, #0, {{r[0-9]*}}, c7, c10, #5
+; THUMBM-LABEL: test2
+; THUMBM: ldr
+; THUMBM: dmb sy
   %val = load atomic i32* %ptr seq_cst, align 4
   ret i32 %val
 }
@@ -55,6 +71,11 @@ define void @test3(i8* %ptr1, i8* %ptr2) {
 ; THUMBONE-NOT: dmb
 ; THUMBONE: strb
 ; THUMBONE-NOT: dmb
+
+; ARMV6-LABEL: test3
+; ARMV6-NOT: mcr
+; THUMBM-LABEL: test3
+; THUMBM-NOT: dmb sy
   %val = load atomic i8* %ptr1 unordered, align 1
   store atomic i8 %val, i8* %ptr2 unordered, align 1
   ret void
@@ -64,6 +85,8 @@ define void @test4(i8* %ptr1, i8* %ptr2) {
 ; THUMBONE-LABEL: test4
 ; THUMBONE: ___sync_val_compare_and_swap_1
 ; THUMBONE: ___sync_lock_test_and_set_1
+; ARMV6-LABEL: test4
+; THUMBM-LABEL: test4
   %val = load atomic i8* %ptr1 seq_cst, align 1
   store atomic i8 %val, i8* %ptr2 seq_cst, align 1
   ret void
diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll
index b988242..1ac8648 100644
--- a/test/CodeGen/ARM/atomic-op.ll
+++ b/test/CodeGen/ARM/atomic-op.ll
@@ -1,7 +1,10 @@
 ; RUN: llc < %s -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s
 ; RUN: llc < %s -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s
 ; RUN: llc < %s -mtriple=thumbv6-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-T1
-; RUN: llc < %s -mtriple=thumbv6-apple-ios -verify-machineinstrs -mcpu=cortex-m0 | FileCheck %s --check-prefix=CHECK-T1
+; RUN: llc < %s -mtriple=thumbv6-apple-ios -verify-machineinstrs -mcpu=cortex-m0 | FileCheck %s --check-prefix=CHECK-M0
+; RUN: llc < %s -mtriple=thumbv7--none-eabi -thread-model single -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-BAREMETAL
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define void @func(i32 %argc, i8** %argv) nounwind {
 entry:
@@ -27,48 +30,72 @@ entry:
   ; CHECK: add
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_add_4
+  ; CHECK-M0: bl ___sync_fetch_and_add_4
+  ; CHECK-BAREMETAL: add
+  ; CHECK-BAREMETAL-NOT: __sync
   %0 = atomicrmw add i32* %val1, i32 %tmp monotonic
 	store i32 %0, i32* %old
   ; CHECK: ldrex
   ; CHECK: sub
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_sub_4
+  ; CHECK-M0: bl ___sync_fetch_and_sub_4
+  ; CHECK-BAREMETAL: sub
+  ; CHECK-BAREMETAL-NOT: __sync
   %1 = atomicrmw sub i32* %val2, i32 30 monotonic
 	store i32 %1, i32* %old
   ; CHECK: ldrex
   ; CHECK: add
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_add_4
+  ; CHECK-M0: bl ___sync_fetch_and_add_4
+  ; CHECK-BAREMETAL: add
+  ; CHECK-BAREMETAL-NOT: __sync
   %2 = atomicrmw add i32* %val2, i32 1 monotonic
 	store i32 %2, i32* %old
   ; CHECK: ldrex
   ; CHECK: sub
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_sub_4
+  ; CHECK-M0: bl ___sync_fetch_and_sub_4
+  ; CHECK-BAREMETAL: sub
+  ; CHECK-BAREMETAL-NOT: __sync
   %3 = atomicrmw sub i32* %val2, i32 1 monotonic
 	store i32 %3, i32* %old
   ; CHECK: ldrex
   ; CHECK: and
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_and_4
+  ; CHECK-M0: bl ___sync_fetch_and_and_4
+  ; CHECK-BAREMETAL: and
+  ; CHECK-BAREMETAL-NOT: __sync
   %4 = atomicrmw and i32* %andt, i32 4080 monotonic
 	store i32 %4, i32* %old
   ; CHECK: ldrex
   ; CHECK: or
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_or_4
+  ; CHECK-M0: bl ___sync_fetch_and_or_4
+  ; CHECK-BAREMETAL: or
+  ; CHECK-BAREMETAL-NOT: __sync
   %5 = atomicrmw or i32* %ort, i32 4080 monotonic
 	store i32 %5, i32* %old
   ; CHECK: ldrex
   ; CHECK: eor
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_xor_4
+  ; CHECK-M0: bl ___sync_fetch_and_xor_4
+  ; CHECK-BAREMETAL: eor
+  ; CHECK-BAREMETAL-NOT: __sync
   %6 = atomicrmw xor i32* %xort, i32 4080 monotonic
 	store i32 %6, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_min_4
+  ; CHECK-M0: bl ___sync_fetch_and_min_4
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %7 = atomicrmw min i32* %val2, i32 16 monotonic
 	store i32 %7, i32* %old
 	%neg = sub i32 0, 1
@@ -76,24 +103,36 @@ entry:
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_min_4
+  ; CHECK-M0: bl ___sync_fetch_and_min_4
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %8 = atomicrmw min i32* %val2, i32 %neg monotonic
 	store i32 %8, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_max_4
+  ; CHECK-M0: bl ___sync_fetch_and_max_4
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %9 = atomicrmw max i32* %val2, i32 1 monotonic
 	store i32 %9, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_max_4
+  ; CHECK-M0: bl ___sync_fetch_and_max_4
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %10 = atomicrmw max i32* %val2, i32 0 monotonic
 	store i32 %10, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umin_4
+  ; CHECK-M0: bl ___sync_fetch_and_umin_4
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %11 = atomicrmw umin i32* %val2, i32 16 monotonic
 	store i32 %11, i32* %old
 	%uneg = sub i32 0, 1
@@ -101,18 +140,27 @@ entry:
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umin_4
+  ; CHECK-M0: bl ___sync_fetch_and_umin_4
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %12 = atomicrmw umin i32* %val2, i32 %uneg monotonic
 	store i32 %12, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umax_4
+  ; CHECK-M0: bl ___sync_fetch_and_umax_4
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %13 = atomicrmw umax i32* %val2, i32 1 monotonic
 	store i32 %13, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umax_4
+  ; CHECK-M0: bl ___sync_fetch_and_umax_4
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %14 = atomicrmw umax i32* %val2, i32 0 monotonic
 	store i32 %14, i32* %old
 
@@ -128,6 +176,9 @@ entry:
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umin_2
+  ; CHECK-M0: bl ___sync_fetch_and_umin_2
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %0 = atomicrmw umin i16* %val, i16 16 monotonic
   store i16 %0, i16* %old
   %uneg = sub i16 0, 1
@@ -135,18 +186,27 @@ entry:
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umin_2
+  ; CHECK-M0: bl ___sync_fetch_and_umin_2
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %1 = atomicrmw umin i16* %val, i16 %uneg monotonic
   store i16 %1, i16* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umax_2
+  ; CHECK-M0: bl ___sync_fetch_and_umax_2
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %2 = atomicrmw umax i16* %val, i16 1 monotonic
   store i16 %2, i16* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umax_2
+  ; CHECK-M0: bl ___sync_fetch_and_umax_2
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %3 = atomicrmw umax i16* %val, i16 0 monotonic
   store i16 %3, i16* %old
   ret void
@@ -161,12 +221,18 @@ entry:
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umin_1
+  ; CHECK-M0: bl ___sync_fetch_and_umin_1
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %0 = atomicrmw umin i8* %val, i8 16 monotonic
   store i8 %0, i8* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umin_1
+  ; CHECK-M0: bl ___sync_fetch_and_umin_1
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %uneg = sub i8 0, 1
   %1 = atomicrmw umin i8* %val, i8 %uneg monotonic
   store i8 %1, i8* %old
@@ -174,12 +240,18 @@ entry:
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umax_1
+  ; CHECK-M0: bl ___sync_fetch_and_umax_1
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %2 = atomicrmw umax i8* %val, i8 1 monotonic
   store i8 %2, i8* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umax_1
+  ; CHECK-M0: bl ___sync_fetch_and_umax_1
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %3 = atomicrmw umax i8* %val, i8 0 monotonic
   store i8 %3, i8* %old
   ret void
@@ -233,3 +305,69 @@ define i32 @test_cmpxchg_fail_order1(i32 *%addr, i32 %desired, i32 %new) {
 
   ret i32 %oldval
 }
+
+define i32 @load_load_add_acquire(i32* %mem1, i32* %mem2) nounwind {
+; CHECK-LABEL: load_load_add_acquire
+  %val1 = load atomic i32* %mem1 acquire, align 4
+  %val2 = load atomic i32* %mem2 acquire, align 4
+  %tmp = add i32 %val1, %val2
+
+; CHECK: ldr {{r[0-9]}}, [r0]
+; CHECK: dmb
+; CHECK: ldr {{r[0-9]}}, [r1]
+; CHECK: dmb
+; CHECK: add r0,
+
+; CHECK-M0: ___sync_val_compare_and_swap_4
+; CHECK-M0: ___sync_val_compare_and_swap_4
+
+; CHECK-BAREMETAL: ldr {{r[0-9]}}, [r0]
+; CHECK-BAREMETAL-NOT: dmb
+; CHECK-BAREMETAL: ldr {{r[0-9]}}, [r1]
+; CHECK-BAREMETAL-NOT: dmb
+; CHECK-BAREMETAL: add r0,
+
+  ret i32 %tmp
+}
+
+define void @store_store_release(i32* %mem1, i32 %val1, i32* %mem2, i32 %val2) {
+; CHECK-LABEL: store_store_release
+  store atomic i32 %val1, i32* %mem1 release, align 4
+  store atomic i32 %val2, i32* %mem2 release, align 4
+
+; CHECK: dmb
+; CHECK: str r1, [r0]
+; CHECK: dmb
+; CHECK: str r3, [r2]
+
+; CHECK-M0: ___sync_lock_test_and_set
+; CHECK-M0: ___sync_lock_test_and_set
+
+; CHECK-BAREMETAL-NOT: dmb
+; CHECK-BAREMTEAL: str r1, [r0]
+; CHECK-BAREMETAL-NOT: dmb
+; CHECK-BAREMTEAL: str r3, [r2]
+
+  ret void
+}
+
+define void @load_fence_store_monotonic(i32* %mem1, i32* %mem2) {
+; CHECK-LABEL: load_fence_store_monotonic
+  %val = load atomic i32* %mem1 monotonic, align 4
+  fence seq_cst
+  store atomic i32 %val, i32* %mem2 monotonic, align 4
+
+; CHECK: ldr [[R0:r[0-9]]], [r0]
+; CHECK: dmb
+; CHECK: str [[R0]], [r1]
+
+; CHECK-M0: ldr [[R0:r[0-9]]], [r0]
+; CHECK-M0: dmb
+; CHECK-M0: str [[R0]], [r1]
+
+; CHECK-BAREMETAL: ldr [[R0:r[0-9]]], [r0]
+; CHECK-BAREMETAL-NOT: dmb
+; CHECK-BAREMETAL: str [[R0]], [r1]
+
+  ret void
+}
diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll
index d75d55d..99c2445 100644
--- a/test/CodeGen/ARM/build-attributes.ll
+++ b/test/CodeGen/ARM/build-attributes.ll
@@ -20,12 +20,16 @@
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-A9-HARD
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 | FileCheck %s --check-prefix=CORTEX-A12-DEFAULT
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -mattr=-vfp2 | FileCheck %s --check-prefix=CORTEX-A12-NOFPU
-; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9-mp | FileCheck %s --check-prefix=CORTEX-A9-MP
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 | FileCheck %s --check-prefix=CORTEX-A15
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 | FileCheck %s --check-prefix=CORTEX-A17-DEFAULT
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -mattr=-vfp2 | FileCheck %s --check-prefix=CORTEX-A17-NOFPU
 ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=CORTEX-M0
 ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m3 | FileCheck %s --check-prefix=CORTEX-M3
 ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-M4-SOFT
 ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-M4-HARD
+; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=-vfp2 | FileCheck %s --check-prefix=CORTEX-M7 --check-prefix=CORTEX-M7-SOFT
+; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=+fp-only-sp | FileCheck %s --check-prefix=CORTEX-M7 --check-prefix=CORTEX-M7-SINGLE
+; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 | FileCheck %s --check-prefix=CORTEX-M7 --check-prefix=CORTEX-M7-DOUBLE
 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 | FileCheck %s --check-prefix=CORTEX-R5
 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 | FileCheck %s --check-prefix=CORTEX-A53
 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57 | FileCheck %s --check-prefix=CORTEX-A57
@@ -38,6 +42,41 @@
 ; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -relocation-model=default | FileCheck %s --check-prefix=RELOC-OTHER
 ; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -relocation-model=dynamic-no-pic | FileCheck %s --check-prefix=RELOC-OTHER
 ; RUN: llc < %s -mtriple=arm-none-linux-gnueabi | FileCheck %s --check-prefix=RELOC-OTHER
+; RUN: llc < %s -mtriple=arm-none-linux-gnueabi | FileCheck %s --check-prefix=PCS-R9-USE
+; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -arm-reserve-r9 | FileCheck %s --check-prefix=PCS-R9-RESERVE
+
+; ARMv8a (AArch32)
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; ARMv7a
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; ARMv7r
+; RUN: llc < %s -mtriple=armv7r-none-linux-gnueabi -mcpu=cortex-r5 -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv7r-none-linux-gnueabi -mcpu=cortex-r5 -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv7r-none-linux-gnueabi -mcpu=cortex-r5 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; ARMv7m
+; RUN: llc < %s -mtriple=thumbv7m-none-linux-gnueabi -mcpu=cortex-m3 -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumbv7m-none-linux-gnueabi -mcpu=cortex-m3 -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumbv7m-none-linux-gnueabi -mcpu=cortex-m3 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; ARMv6
+; RUN: llc < %s -mtriple=armv6-none-netbsd-gnueabi -mcpu=arm1136j-s | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv6-none-linux-gnueabi -mcpu=arm1136j-s | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv6-none-linux-gnueabi -mcpu=arm1136j-s -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv6-none-linux-gnueabi -mcpu=arm1136j-s -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; ARMv6m
+; RUN: llc < %s -mtriple=thumb-none-linux-gnueabi -arm-no-strict-align -mcpu=cortex-m0 | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumb-none-linux-gnueabi -arm-strict-align -mcpu=cortex-m0 | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumbv6m-none-linux-gnueabi -arm-no-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumbv6m-none-linux-gnueabi -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumb-none-linux-gnueabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumb-none-linux-gnueabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=STRICT-ALIGN
+; ARMv5
+; RUN: llc < %s -mtriple=armv5-none-linux-gnueabi -mcpu=arm1022e -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv5-none-linux-gnueabi -mcpu=arm1022e -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv5-none-linux-gnueabi -mcpu=arm1022e | FileCheck %s --check-prefix=STRICT-ALIGN
 
 ; XSCALE:      .eabi_attribute 6, 5
 ; XSCALE:      .eabi_attribute 8, 1
@@ -132,6 +171,10 @@
 ; V8-FPARMv8-NEON-CRYPTO: .fpu crypto-neon-fp-armv8
 ; V8-FPARMv8-NEON-CRYPTO: .eabi_attribute 12, 3
 
+; Tag_CPU_unaligned_access
+; NO-STRICT-ALIGN: .eabi_attribute 34, 1
+; STRICT-ALIGN: .eabi_attribute 34, 0
+
 ; Tag_CPU_arch	'ARMv7'
 ; CORTEX-A7-CHECK: .eabi_attribute	6, 10
 ; CORTEX-A7-NOFPU: .eabi_attribute	6, 10
@@ -257,7 +300,7 @@
 ; CORTEX-A9-SOFT-NOT:  .eabi_attribute 27
 ; CORTEX-A9-SOFT-NOT:  .eabi_attribute 28
 ; CORTEX-A9-SOFT:  .eabi_attribute 36, 1
-; CORTEX-A9-SOFT-NOT:  .eabi_attribute 42
+; CORTEX-A9-SOFT:  .eabi_attribute 42, 1
 ; CORTEX-A9-SOFT:  .eabi_attribute 68, 1
 
 ; CORTEX-A9-HARD:  .cpu cortex-a9
@@ -274,26 +317,9 @@
 ; CORTEX-A9-HARD-NOT:  .eabi_attribute 27
 ; CORTEX-A9-HARD:  .eabi_attribute 28, 1
 ; CORTEX-A9-HARD:  .eabi_attribute 36, 1
-; CORTEX-A9-HARD-NOT:  .eabi_attribute 42
+; CORTEX-A9-HARD:  .eabi_attribute 42, 1
 ; CORTEX-A9-HARD:  .eabi_attribute 68, 1
 
-; CORTEX-A9-MP:  .cpu cortex-a9-mp
-; CORTEX-A9-MP:  .eabi_attribute 6, 10
-; CORTEX-A9-MP:  .eabi_attribute 7, 65
-; CORTEX-A9-MP:  .eabi_attribute 8, 1
-; CORTEX-A9-MP:  .eabi_attribute 9, 2
-; CORTEX-A9-MP:  .fpu neon
-; CORTEX-A9-MP:  .eabi_attribute 20, 1
-; CORTEX-A9-MP:  .eabi_attribute 21, 1
-; CORTEX-A9-MP:  .eabi_attribute 23, 3
-; CORTEX-A9-MP:  .eabi_attribute 24, 1
-; CORTEX-A9-MP:  .eabi_attribute 25, 1
-; CORTEX-A9-MP-NOT:  .eabi_attribute 27
-; CORTEX-A9-MP-NOT:  .eabi_attribute 28
-; CORTEX-A9-MP:  .eabi_attribute 36, 1
-; CORTEX-A9-MP:  .eabi_attribute 42, 1
-; CORTEX-A9-MP:  .eabi_attribute 68, 1
-
 ; CORTEX-A12-DEFAULT:  .cpu cortex-a12
 ; CORTEX-A12-DEFAULT:  .eabi_attribute 6, 10
 ; CORTEX-A12-DEFAULT:  .eabi_attribute 7, 65
@@ -342,6 +368,36 @@
 ; CORTEX-A15: .eabi_attribute 44, 2
 ; CORTEX-A15: .eabi_attribute 68, 3
 
+; CORTEX-A17-DEFAULT:  .cpu cortex-a17
+; CORTEX-A17-DEFAULT:  .eabi_attribute 6, 10
+; CORTEX-A17-DEFAULT:  .eabi_attribute 7, 65
+; CORTEX-A17-DEFAULT:  .eabi_attribute 8, 1
+; CORTEX-A17-DEFAULT:  .eabi_attribute 9, 2
+; CORTEX-A17-DEFAULT:  .fpu neon-vfpv4
+; CORTEX-A17-DEFAULT:  .eabi_attribute 20, 1
+; CORTEX-A17-DEFAULT:  .eabi_attribute 21, 1
+; CORTEX-A17-DEFAULT:  .eabi_attribute 23, 3
+; CORTEX-A17-DEFAULT:  .eabi_attribute 24, 1
+; CORTEX-A17-DEFAULT:  .eabi_attribute 25, 1
+; CORTEX-A17-DEFAULT:  .eabi_attribute 42, 1
+; CORTEX-A17-DEFAULT:  .eabi_attribute 44, 2
+; CORTEX-A17-DEFAULT:  .eabi_attribute 68, 3
+
+; CORTEX-A17-NOFPU:  .cpu cortex-a17
+; CORTEX-A17-NOFPU:  .eabi_attribute 6, 10
+; CORTEX-A17-NOFPU:  .eabi_attribute 7, 65
+; CORTEX-A17-NOFPU:  .eabi_attribute 8, 1
+; CORTEX-A17-NOFPU:  .eabi_attribute 9, 2
+; CORTEX-A17-NOFPU-NOT:  .fpu
+; CORTEX-A17-NOFPU:  .eabi_attribute 20, 1
+; CORTEX-A17-NOFPU:  .eabi_attribute 21, 1
+; CORTEX-A17-NOFPU:  .eabi_attribute 23, 3
+; CORTEX-A17-NOFPU:  .eabi_attribute 24, 1
+; CORTEX-A17-NOFPU:  .eabi_attribute 25, 1
+; CORTEX-A17-NOFPU:  .eabi_attribute 42, 1
+; CORTEX-A17-NOFPU:  .eabi_attribute 44, 2
+; CORTEX-A17-NOFPU:  .eabi_attribute 68, 3
+
 ; CORTEX-M0:  .cpu cortex-m0
 ; CORTEX-M0:  .eabi_attribute 6, 12
 ; CORTEX-M0-NOT:  .eabi_attribute 7
@@ -408,6 +464,26 @@
 ; CORTEX-M4-HARD-NOT:  .eabi_attribute 44
 ; CORTEX-M4-HARD-NOT:  .eabi_attribute 68
 
+; CORTEX-M7:  .cpu    cortex-m7
+; CORTEX-M7:  .eabi_attribute 6, 13
+; CORTEX-M7:  .eabi_attribute 7, 77
+; CORTEX-M7:  .eabi_attribute 8, 0
+; CORTEX-M7:  .eabi_attribute 9, 2
+; CORTEX-M7-SOFT-NOT: .fpu
+; CORTEX-M7-SINGLE:  .fpu fpv5-d16
+; CORTEX-M7-DOUBLE:  .fpu fpv5-d16
+; CORTEX-M7:  .eabi_attribute 17, 1
+; CORTEX-M7:  .eabi_attribute 20, 1
+; CORTEX-M7:  .eabi_attribute 21, 1
+; CORTEX-M7:  .eabi_attribute 23, 3
+; CORTEX-M7:  .eabi_attribute 24, 1
+; CORTEX-M7:  .eabi_attribute 25, 1
+; CORTEX-M7-SOFT-NOT: .eabi_attribute 27
+; CORTEX-M7-SINGLE:  .eabi_attribute 27, 1
+; CORTEX-M7-DOUBLE-NOT: .eabi_attribute 27
+; CORTEX-M7:  .eabi_attribute 36, 1
+; CORTEX-M7:  .eabi_attribute 14, 0
+
 ; CORTEX-R5:  .cpu cortex-r5
 ; CORTEX-R5:  .eabi_attribute 6, 10
 ; CORTEX-R5:  .eabi_attribute 7, 82
@@ -463,6 +539,9 @@
 ; RELOC-PIC:  .eabi_attribute 17, 2
 ; RELOC-OTHER:  .eabi_attribute 17, 1
 
+; PCS-R9-USE:  .eabi_attribute 14, 0
+; PCS-R9-RESERVE:  .eabi_attribute 14, 3
+
 define i32 @f(i64 %z) {
 	ret i32 0
 }
diff --git a/test/CodeGen/ARM/carry.ll b/test/CodeGen/ARM/carry.ll
index e344b08..7ea9be2 100644
--- a/test/CodeGen/ARM/carry.ll
+++ b/test/CodeGen/ARM/carry.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
+; RUN: llc -mtriple=armv6t2-eabi %s -o - | FileCheck %s
 
 define i64 @f1(i64 %a, i64 %b) {
 ; CHECK-LABEL: f1:
diff --git a/test/CodeGen/ARM/coalesce-dbgvalue.ll b/test/CodeGen/ARM/coalesce-dbgvalue.ll
index 606c9bc..47d81a6 100644
--- a/test/CodeGen/ARM/coalesce-dbgvalue.ll
+++ b/test/CodeGen/ARM/coalesce-dbgvalue.ll
@@ -27,11 +27,11 @@ for.cond1:                                        ; preds = %for.end9, %for.cond
 
 for.body2:                                        ; preds = %for.cond1
   store i32 %storemerge11, i32* @b, align 4, !dbg !26
-  tail call void @llvm.dbg.value(metadata !27, i64 0, metadata !11), !dbg !28
+  tail call void @llvm.dbg.value(metadata !27, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !28
   %0 = load i64* @a, align 8, !dbg !29
   %xor = xor i64 %0, %e.1.ph, !dbg !29
   %conv3 = trunc i64 %xor to i32, !dbg !29
-  tail call void @llvm.dbg.value(metadata !{i32 %conv3}, i64 0, metadata !10), !dbg !29
+  tail call void @llvm.dbg.value(metadata !{i32 %conv3}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !29
   %tobool4 = icmp eq i32 %conv3, 0, !dbg !29
   br i1 %tobool4, label %land.end, label %land.rhs, !dbg !29
 
@@ -69,7 +69,7 @@ declare i32 @fn2(...) #1
 declare i32 @fn3(...) #1
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
 attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -79,33 +79,33 @@ attributes #3 = { nounwind }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!33}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 182024) (llvm/trunk 182023)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !15, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/d/b/pr16110.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 182024) (llvm/trunk 182023)\001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !15, metadata !2} ; [ DW_TAG_compile_unit ] [/d/b/pr16110.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"pr16110.c", metadata !"/d/b"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"pr16110", metadata !"pr16110", metadata !"", i32 7, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @pr16110, null, null, metadata !9, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [pr16110]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/d/b/pr16110.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00pr16110\00pr16110\00\007\000\001\000\006\000\001\007", metadata !1, metadata !5, metadata !6, null, i32 ()* @pr16110, null, null, metadata !9} ; [ DW_TAG_subprogram ] [line 7] [def] [pr16110]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/d/b/pr16110.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{metadata !10, metadata !11}
-!10 = metadata !{i32 786688, metadata !4, metadata !"e", metadata !5, i32 8, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [e] [line 8]
-!11 = metadata !{i32 786688, metadata !12, metadata !"f", metadata !5, i32 13, metadata !14, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [f] [line 13]
-!12 = metadata !{i32 786443, metadata !1, metadata !13, i32 12, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
-!13 = metadata !{i32 786443, metadata !1, metadata !4, i32 12, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
-!14 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from int]
+!10 = metadata !{metadata !"0x100\00e\008\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [e] [line 8]
+!11 = metadata !{metadata !"0x100\00f\0013\000", metadata !12, metadata !5, metadata !14} ; [ DW_TAG_auto_variable ] [f] [line 13]
+!12 = metadata !{metadata !"0xb\0012\000\002", metadata !1, metadata !13} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
+!13 = metadata !{metadata !"0xb\0012\000\001", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
+!14 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from int]
 !15 = metadata !{metadata !16, metadata !18, metadata !19, metadata !20}
-!16 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !5, i32 1, metadata !17, i32 0, i32 1, i64* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
-!17 = metadata !{i32 786468, null, null, metadata !"long long int", i32 0, i64 64, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [long long int] [line 0, size 64, align 32, offset 0, enc DW_ATE_signed]
-!18 = metadata !{i32 786484, i32 0, null, metadata !"b", metadata !"b", metadata !"", metadata !5, i32 2, metadata !8, i32 0, i32 1, i32* @b, null} ; [ DW_TAG_variable ] [b] [line 2] [def]
-!19 = metadata !{i32 786484, i32 0, null, metadata !"c", metadata !"c", metadata !"", metadata !5, i32 3, metadata !8, i32 0, i32 1, i32* @c, null} ; [ DW_TAG_variable ] [c] [line 3] [def]
-!20 = metadata !{i32 786484, i32 0, null, metadata !"d", metadata !"d", metadata !"", metadata !5, i32 4, metadata !8, i32 0, i32 1, i32* @d, null} ; [ DW_TAG_variable ] [d] [line 4] [def]
+!16 = metadata !{metadata !"0x34\00a\00a\00\001\000\001", null, metadata !5, metadata !17, i64* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
+!17 = metadata !{metadata !"0x24\00long long int\000\0064\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [long long int] [line 0, size 64, align 32, offset 0, enc DW_ATE_signed]
+!18 = metadata !{metadata !"0x34\00b\00b\00\002\000\001", null, metadata !5, metadata !8, i32* @b, null} ; [ DW_TAG_variable ] [b] [line 2] [def]
+!19 = metadata !{metadata !"0x34\00c\00c\00\003\000\001", null, metadata !5, metadata !8, i32* @c, null} ; [ DW_TAG_variable ] [c] [line 3] [def]
+!20 = metadata !{metadata !"0x34\00d\00d\00\004\000\001", null, metadata !5, metadata !8, i32* @d, null} ; [ DW_TAG_variable ] [d] [line 4] [def]
 !21 = metadata !{i32 10, i32 0, metadata !22, null}
-!22 = metadata !{i32 786443, metadata !1, metadata !4, i32 10, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
+!22 = metadata !{metadata !"0xb\0010\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
 !26 = metadata !{i32 12, i32 0, metadata !13, null}
 !27 = metadata !{i32* null}
 !28 = metadata !{i32 13, i32 0, metadata !12, null}
 !29 = metadata !{i32 14, i32 0, metadata !12, null}
 !31 = metadata !{i32 16, i32 0, metadata !4, null}
 !32 = metadata !{i32 18, i32 0, metadata !4, null}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/constant-islands.ll b/test/CodeGen/ARM/constant-islands.ll
new file mode 100644
index 0000000..afa4b85
--- /dev/null
+++ b/test/CodeGen/ARM/constant-islands.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple=thumbv7-linux-gnueabihf -O0 -fast-isel=0 -o - %s | FileCheck %s
+
+define void @test_no_duplicate_branches(float %in) {
+; CHECK-LABEL: test_no_duplicate_branches:
+; CHECK: vldr {{s[0-9]+}}, [[CONST:\.LCPI[0-9]+_[0-9]+]]
+; CHECK: b .LBB
+; CHECK-NOT: b .LBB
+; CHECK: [[CONST]]:
+; CHECK-NEXT: .long 1150963712
+
+  %tst = fcmp oeq float %in, 1234.5
+
+  %chain = zext i1 %tst to i32
+
+  br i1 %tst, label %true, label %false
+
+true:
+  call i32 @llvm.arm.space(i32 2000, i32 undef)
+  ret void
+
+false:
+  ret void
+}
+
+declare i32 @llvm.arm.space(i32, i32)
diff --git a/test/CodeGen/ARM/copy-cpsr.ll b/test/CodeGen/ARM/copy-cpsr.ll
new file mode 100644
index 0000000..8b7dc03
--- /dev/null
+++ b/test/CodeGen/ARM/copy-cpsr.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mtriple=armv7s-apple-ios7.0 -show-mc-encoding %s -o - | FileCheck %s --check-prefix=CHECK-ARM
+; RUN: llc -mtriple=thumbv7s-apple-ios7.0 -show-mc-encoding %s -o - | FileCheck %s --check-prefix=CHECK-THUMB
+; RUN: llc -mtriple=thumbv7m-none-eabi -show-mc-encoding %s -o - | FileCheck %s --check-prefix=CHECK-THUMB
+
+; In the ARM backend, most compares are glued to their uses so CPSR can't
+; escape. However, for long ADCS chains (and last ditch fallback) the dependency
+; is carried in the DAG because duplicating them can be more expensive than
+; copying CPSR.
+
+; Crafting a test for this was a little tricky, in case it breaks here are some
+; notes on what I was tring to achieve:
+;   + We want 2 long ADCS chains
+;   + We want them to split after an initial common prefix (so that a single
+;     CPSR is used twice).
+;   + We want both chains to write CPSR post-split (so that the copy can't be
+;     elided).
+;   + We want the chains to be long enough that duplicating them is expensive.
+
+define void @test_copy_cpsr(i128 %lhs, i128 %rhs, i128* %addr) {
+; CHECK-ARM: test_copy_cpsr:
+; CHECK-THUMB: test_copy_cpsr:
+
+; CHECK-ARM: mrs [[TMP:r[0-9]+]], apsr @ encoding: [0x00,0x{{[0-9a-f]}}0,0x0f,0xe1]
+; CHECK-ARM: msr APSR_nzcvq, [[TMP]] @ encoding: [0x0{{[0-9a-f]}},0xf0,0x28,0xe1]
+
+  ; In Thumb mode v7M and v7AR have different MRS/MSR instructions that happen
+  ; to overlap for the apsr case, so it's definitely worth checking both.
+; CHECK-THUMB: mrs [[TMP:r[0-9]+]], apsr @ encoding: [0xef,0xf3,0x00,0x8{{[0-9a-f]}}]
+; CHECK-THUMB: msr {{APSR|apsr}}_nzcvq, [[TMP]] @ encoding: [0x8{{[0-9a-f]}},0xf3,0x00,0x88]
+
+  %sum = add i128 %lhs, %rhs
+  store volatile i128 %sum, i128* %addr
+
+  %rhs2.tmp1 = trunc i128 %rhs to i64
+  %rhs2 = zext i64 %rhs2.tmp1 to i128
+
+  %sum2 = add i128 %lhs, %rhs2
+  store volatile i128 %sum2, i128* %addr
+
+  ret void
+}
diff --git a/test/CodeGen/ARM/darwin-eabi.ll b/test/CodeGen/ARM/darwin-eabi.ll
index f2cde71..5301c0b 100644
--- a/test/CodeGen/ARM/darwin-eabi.ll
+++ b/test/CodeGen/ARM/darwin-eabi.ll
@@ -7,7 +7,7 @@ define float @float_op(float %lhs, float %rhs) {
   %sum = fadd float %lhs, %rhs
   ret float %sum
 ; CHECK-M3-LABEL: float_op:
-; CHECK-M3: blx ___addsf3
+; CHECK-M3: bl ___addsf3
 
 ; CHECK-M4-LABEL: float_op:
 ; CHECK-M4: vadd.f32
@@ -17,8 +17,8 @@ define double @double_op(double %lhs, double %rhs) {
   %sum = fadd double %lhs, %rhs
   ret double %sum
 ; CHECK-M3-LABEL: double_op:
-; CHECK-M3: blx ___adddf3
+; CHECK-M3: bl ___adddf3
 
 ; CHECK-M4-LABEL: double_op:
-; CHECK-M4: blx ___adddf3
+; CHECK-M4: {{(blx|b.w)}} ___adddf3
 }
diff --git a/test/CodeGen/ARM/dbg.ll b/test/CodeGen/ARM/dbg.ll
new file mode 100644
index 0000000..8bce1a6
--- /dev/null
+++ b/test/CodeGen/ARM/dbg.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple armv8-eabi -mcpu=cortex-a57 -o - %s | FileCheck %s
+; RUN: llc -mtriple thumbv8-eabi -mcpu=cortex-a57 -o - %s | FileCheck %s
+
+define void @hint_dbg() {
+entry:
+  call void @llvm.arm.dbg(i32 0)
+  ret void
+}
+
+declare void @llvm.arm.dbg(i32)
+
+; CHECK: dbg #0
+
diff --git a/test/CodeGen/ARM/debug-frame-large-stack.ll b/test/CodeGen/ARM/debug-frame-large-stack.ll
index 5bafce9..1addf63 100644
--- a/test/CodeGen/ARM/debug-frame-large-stack.ll
+++ b/test/CodeGen/ARM/debug-frame-large-stack.ll
@@ -1,28 +1,11 @@
-; RUN: llc -filetype=asm -o - < %s -mtriple arm-arm-none-eabi -disable-fp-elim| FileCheck %s --check-prefix=CHECK-ARM
-; RUN: llc -filetype=asm -o - < %s -mtriple arm-arm-none-eabi | FileCheck %s --check-prefix=CHECK-ARM-FP-ELIM
+; RUN: llc -filetype=asm -o - < %s -mtriple arm-arm-netbsd-eabi -disable-fp-elim| FileCheck %s --check-prefix=CHECK-ARM
+; RUN: llc -filetype=asm -o - < %s -mtriple arm-arm-netbsd-eabi | FileCheck %s --check-prefix=CHECK-ARM-FP-ELIM
 
 define void @test1() {
     %tmp = alloca [ 64 x i32 ] , align 4
     ret void
 }
 
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!8, !9}
-!llvm.ident = !{!10}
-
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/large.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"large.c", metadata !"/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test1", metadata !"test1", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @test1, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [test1]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/large.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
-!10 = metadata !{metadata !"clang version 3.5 "}
-!11 = metadata !{i32 2, i32 0, metadata !4, null}
-
 ; CHECK-ARM-LABEL: test1:
 ; CHECK-ARM: .cfi_startproc
 ; CHECK-ARM: sub    sp, sp, #256
diff --git a/test/CodeGen/ARM/debug-frame-vararg.ll b/test/CodeGen/ARM/debug-frame-vararg.ll
index 42ff82d..ffc1a6a 100644
--- a/test/CodeGen/ARM/debug-frame-vararg.ll
+++ b/test/CodeGen/ARM/debug-frame-vararg.ll
@@ -25,37 +25,37 @@
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/var.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/var.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"var.c", metadata !"/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"sum", metadata !"sum", metadata !"", i32 5, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, ...)* @sum, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [sum]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/var.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00sum\00sum\00\005\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !6, null, i32 (i32, ...)* @sum, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [sum]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/var.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !11 = metadata !{metadata !"clang version 3.5 "}
-!12 = metadata !{i32 786689, metadata !4, metadata !"count", metadata !5, i32 16777221, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [count] [line 5]
+!12 = metadata !{metadata !"0x101\00count\0016777221\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [count] [line 5]
 !13 = metadata !{i32 5, i32 0, metadata !4, null}
-!14 = metadata !{i32 786688, metadata !4, metadata !"vl", metadata !5, i32 6, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vl] [line 6]
-!15 = metadata !{i32 786454, metadata !16, null, metadata !"va_list", i32 30, i64 0, i64 0, i64 0, i32 0, metadata !17} ; [ DW_TAG_typedef ] [va_list] [line 30, size 0, align 0, offset 0] [from __builtin_va_list]
+!14 = metadata !{metadata !"0x100\00vl\006\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [vl] [line 6]
+!15 = metadata !{metadata !"0x16\00va_list\0030\000\000\000\000", metadata !16, null, metadata !17} ; [ DW_TAG_typedef ] [va_list] [line 30, size 0, align 0, offset 0] [from __builtin_va_list]
 !16 = metadata !{metadata !"/linux-x86_64-high/gcc_4.7.2/dbg/llvm/bin/../lib/clang/3.5/include/stdarg.h", metadata !"/tmp"}
-!17 = metadata !{i32 786454, metadata !1, null, metadata !"__builtin_va_list", i32 6, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_typedef ] [__builtin_va_list] [line 6, size 0, align 0, offset 0] [from __va_list]
-!18 = metadata !{i32 786451, metadata !1, null, metadata !"__va_list", i32 6, i64 32, i64 32, i32 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__va_list] [line 6, size 32, align 32, offset 0] [def] [from ]
+!17 = metadata !{metadata !"0x16\00__builtin_va_list\006\000\000\000\000", metadata !1, null, metadata !18} ; [ DW_TAG_typedef ] [__builtin_va_list] [line 6, size 0, align 0, offset 0] [from __va_list]
+!18 = metadata !{metadata !"0x13\00__va_list\006\0032\0032\000\000\000", metadata !1, null, null, metadata !19, null, null, null} ; [ DW_TAG_structure_type ] [__va_list] [line 6, size 32, align 32, offset 0] [def] [from ]
 !19 = metadata !{metadata !20}
-!20 = metadata !{i32 786445, metadata !1, metadata !18, metadata !"__ap", i32 6, i64 32, i64 32, i64 0, i32 0, metadata !21} ; [ DW_TAG_member ] [__ap] [line 6, size 32, align 32, offset 0] [from ]
-!21 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from ]
+!20 = metadata !{metadata !"0xd\00__ap\006\0032\0032\000\000", metadata !1, metadata !18, metadata !21} ; [ DW_TAG_member ] [__ap] [line 6, size 32, align 32, offset 0] [from ]
+!21 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from ]
 !22 = metadata !{i32 6, i32 0, metadata !4, null}
 !23 = metadata !{i32 7, i32 0, metadata !4, null}
-!24 = metadata !{i32 786688, metadata !4, metadata !"sum", metadata !5, i32 8, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [sum] [line 8]
-!25 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
-!26 = metadata !{i32 786688, metadata !27, metadata !"i", metadata !5, i32 9, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 9]
-!27 = metadata !{i32 786443, metadata !1, metadata !4, i32 9, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
+!24 = metadata !{metadata !"0x100\00sum\008\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [sum] [line 8]
+!25 = metadata !{i32 8, i32 0, metadata !4, null}
+!26 = metadata !{metadata !"0x100\00i\009\000", metadata !27, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 9]
+!27 = metadata !{metadata !"0xb\009\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
 !28 = metadata !{i32 9, i32 0, metadata !27, null}
 !29 = metadata !{i32 10, i32 0, metadata !30, null}
-!30 = metadata !{i32 786443, metadata !1, metadata !27, i32 9, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
+!30 = metadata !{metadata !"0xb\009\000\001", metadata !1, metadata !27} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
 !31 = metadata !{i32 11, i32 0, metadata !30, null}
 !32 = metadata !{i32 12, i32 0, metadata !4, null}
 !33 = metadata !{i32 13, i32 0, metadata !4, null}
diff --git a/test/CodeGen/ARM/debug-frame.ll b/test/CodeGen/ARM/debug-frame.ll
index cb54aa8..c6243ec 100644
--- a/test/CodeGen/ARM/debug-frame.ll
+++ b/test/CodeGen/ARM/debug-frame.ll
@@ -128,37 +128,37 @@ declare void @_ZSt9terminatev()
 !llvm.module.flags = !{!10, !11}
 !llvm.ident = !{!12}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/exp.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/exp.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"exp.cpp", metadata !"/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test", metadata !"test", metadata !"_Z4testiiiiiddddd", i32 4, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32, i32, i32, i32, i32, double, double, double, double, double)* @_Z4testiiiiiddddd, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 5] [test]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/exp.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00test\00test\00_Z4testiiiiiddddd\004\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !6, null, void (i32, i32, i32, i32, i32, double, double, double, double, double)* @_Z4testiiiiiddddd, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 5] [test]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/exp.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8, metadata !8, metadata !8, metadata !8, metadata !8, metadata !9, metadata !9, metadata !9, metadata !9, metadata !9}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786468, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
 !10 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !12 = metadata !{metadata !"clang version 3.5 "}
-!13 = metadata !{i32 786689, metadata !4, metadata !"a", metadata !5, i32 16777220, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 4]
+!13 = metadata !{metadata !"0x101\00a\0016777220\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 4]
 !14 = metadata !{i32 4, i32 0, metadata !4, null}
-!15 = metadata !{i32 786689, metadata !4, metadata !"b", metadata !5, i32 33554436, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 4]
-!16 = metadata !{i32 786689, metadata !4, metadata !"c", metadata !5, i32 50331652, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [c] [line 4]
-!17 = metadata !{i32 786689, metadata !4, metadata !"d", metadata !5, i32 67108868, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [d] [line 4]
-!18 = metadata !{i32 786689, metadata !4, metadata !"e", metadata !5, i32 83886084, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [e] [line 4]
-!19 = metadata !{i32 786689, metadata !4, metadata !"m", metadata !5, i32 100663301, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [m] [line 5]
+!15 = metadata !{metadata !"0x101\00b\0033554436\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [b] [line 4]
+!16 = metadata !{metadata !"0x101\00c\0050331652\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [c] [line 4]
+!17 = metadata !{metadata !"0x101\00d\0067108868\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [d] [line 4]
+!18 = metadata !{metadata !"0x101\00e\0083886084\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [e] [line 4]
+!19 = metadata !{metadata !"0x101\00m\00100663301\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [m] [line 5]
 !20 = metadata !{i32 5, i32 0, metadata !4, null}
-!21 = metadata !{i32 786689, metadata !4, metadata !"n", metadata !5, i32 117440517, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [n] [line 5]
-!22 = metadata !{i32 786689, metadata !4, metadata !"p", metadata !5, i32 134217733, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p] [line 5]
-!23 = metadata !{i32 786689, metadata !4, metadata !"q", metadata !5, i32 150994949, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [q] [line 5]
-!24 = metadata !{i32 786689, metadata !4, metadata !"r", metadata !5, i32 167772165, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [r] [line 5]
+!21 = metadata !{metadata !"0x101\00n\00117440517\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [n] [line 5]
+!22 = metadata !{metadata !"0x101\00p\00134217733\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [p] [line 5]
+!23 = metadata !{metadata !"0x101\00q\00150994949\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [q] [line 5]
+!24 = metadata !{metadata !"0x101\00r\00167772165\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [r] [line 5]
 !25 = metadata !{i32 7, i32 0, metadata !26, null}
-!26 = metadata !{i32 786443, metadata !1, metadata !4, i32 6, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/exp.cpp]
-!27 = metadata !{i32 8, i32 0, metadata !26, null} ; [ DW_TAG_imported_declaration ]
+!26 = metadata !{metadata !"0xb\006\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/tmp/exp.cpp]
+!27 = metadata !{i32 8, i32 0, metadata !26, null}
 !28 = metadata !{i32 11, i32 0, metadata !26, null}
 !29 = metadata !{i32 9, i32 0, metadata !30, null}
-!30 = metadata !{i32 786443, metadata !1, metadata !4, i32 8, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/exp.cpp]
+!30 = metadata !{metadata !"0xb\008\000\001", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/tmp/exp.cpp]
 !31 = metadata !{i32 10, i32 0, metadata !30, null}
 !32 = metadata !{i32 10, i32 0, metadata !4, null}
 !33 = metadata !{i32 11, i32 0, metadata !4, null}
diff --git a/test/CodeGen/ARM/debug-info-arg.ll b/test/CodeGen/ARM/debug-info-arg.ll
index 31d0324..34e9938 100644
--- a/test/CodeGen/ARM/debug-info-arg.ll
+++ b/test/CodeGen/ARM/debug-info-arg.ll
@@ -7,13 +7,13 @@ target triple = "thumbv7-apple-ios"
 %struct.tag_s = type { i32, i32, i32 }
 
 define void @foo(%struct.tag_s* nocapture %this, %struct.tag_s* %c, i64 %x, i64 %y, %struct.tag_s* nocapture %ptr1, %struct.tag_s* nocapture %ptr2) nounwind ssp {
-  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %this}, i64 0, metadata !5), !dbg !20
-  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %c}, i64 0, metadata !13), !dbg !21
-  tail call void @llvm.dbg.value(metadata !{i64 %x}, i64 0, metadata !14), !dbg !22
-  tail call void @llvm.dbg.value(metadata !{i64 %y}, i64 0, metadata !17), !dbg !23
+  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %this}, i64 0, metadata !5, metadata !{metadata !"0x102"}), !dbg !20
+  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %c}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !21
+  tail call void @llvm.dbg.value(metadata !{i64 %x}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !22
+  tail call void @llvm.dbg.value(metadata !{i64 %y}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !23
 ;CHECK:	@DEBUG_VALUE: foo:y <- [R7+8]
-  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %ptr1}, i64 0, metadata !18), !dbg !24
-  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %ptr2}, i64 0, metadata !19), !dbg !25
+  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %ptr1}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !24
+  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %ptr2}, i64 0, metadata !19, metadata !{metadata !"0x102"}), !dbg !25
   %1 = icmp eq %struct.tag_s* %c, null, !dbg !26
   br i1 %1, label %3, label %2, !dbg !26
 
@@ -27,31 +27,31 @@ define void @foo(%struct.tag_s* nocapture %this, %struct.tag_s* %c, i64 %x, i64
 
 declare void @foobar(i64, i64)
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!33}
 
-!0 = metadata !{i32 786449, metadata !32, i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !30, null,  null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !2, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 11, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%struct.tag_s*, %struct.tag_s*, i64, i64, %struct.tag_s*, %struct.tag_s*)* @foo, null, null, metadata !31, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [foo]
-!2 = metadata !{i32 786473, metadata !32} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !32, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)\001\00\000\00\001", metadata !32, metadata !4, metadata !4, metadata !30, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00\0011\000\001\000\006\00256\001\0011", metadata !2, metadata !2, metadata !3, null, void (%struct.tag_s*, %struct.tag_s*, i64, i64, %struct.tag_s*, %struct.tag_s*)* @foo, null, null, metadata !31} ; [ DW_TAG_subprogram ] [line 11] [def] [foo]
+!2 = metadata !{metadata !"0x29", metadata !32} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !32, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
-!5 = metadata !{i32 786689, metadata !1, metadata !"this", metadata !2, i32 16777227, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!6 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 786451, metadata !32, metadata !0, metadata !"tag_s", i32 5, i64 96, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [tag_s] [line 5, size 96, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"0x101\00this\0016777227\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!6 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !7} ; [ DW_TAG_pointer_type ]
+!7 = metadata !{metadata !"0x13\00tag_s\005\0096\0032\000\000\000", metadata !32, metadata !0, null, metadata !8, null, null, null} ; [ DW_TAG_structure_type ] [tag_s] [line 5, size 96, align 32, offset 0] [def] [from ]
 !8 = metadata !{metadata !9, metadata !11, metadata !12}
-!9 = metadata !{i32 786445, metadata !32, metadata !7, metadata !"x", i32 6, i64 32, i64 32, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!11 = metadata !{i32 786445, metadata !32, metadata !7, metadata !"y", i32 7, i64 32, i64 32, i64 32, i32 0, metadata !10} ; [ DW_TAG_member ]
-!12 = metadata !{i32 786445, metadata !32, metadata !7, metadata !"z", i32 8, i64 32, i64 32, i64 64, i32 0, metadata !10} ; [ DW_TAG_member ]
-!13 = metadata !{i32 786689, metadata !1, metadata !"c", metadata !2, i32 33554443, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!14 = metadata !{i32 786689, metadata !1, metadata !"x", metadata !2, i32 50331659, metadata !15, i32 0, null} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 786454, metadata !32, metadata !0, metadata !"UInt64", i32 1, i64 0, i64 0, i64 0, i32 0, metadata !16} ; [ DW_TAG_typedef ]
-!16 = metadata !{i32 786468, null, metadata !0, metadata !"long long unsigned int", i32 0, i64 64, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!17 = metadata !{i32 786689, metadata !1, metadata !"y", metadata !2, i32 67108875, metadata !15, i32 0, null} ; [ DW_TAG_arg_variable ]
-!18 = metadata !{i32 786689, metadata !1, metadata !"ptr1", metadata !2, i32 83886091, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786689, metadata !1, metadata !"ptr2", metadata !2, i32 100663307, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0xd\00x\006\0032\0032\000\000", metadata !32, metadata !7, metadata !10} ; [ DW_TAG_member ]
+!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
+!11 = metadata !{metadata !"0xd\00y\007\0032\0032\0032\000", metadata !32, metadata !7, metadata !10} ; [ DW_TAG_member ]
+!12 = metadata !{metadata !"0xd\00z\008\0032\0032\0064\000", metadata !32, metadata !7, metadata !10} ; [ DW_TAG_member ]
+!13 = metadata !{metadata !"0x101\00c\0033554443\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!14 = metadata !{metadata !"0x101\00x\0050331659\000", metadata !1, metadata !2, metadata !15} ; [ DW_TAG_arg_variable ]
+!15 = metadata !{metadata !"0x16\00UInt64\001\000\000\000\000", metadata !32, metadata !0, metadata !16} ; [ DW_TAG_typedef ]
+!16 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0032\000\000\007", null, metadata !0} ; [ DW_TAG_base_type ]
+!17 = metadata !{metadata !"0x101\00y\0067108875\000", metadata !1, metadata !2, metadata !15} ; [ DW_TAG_arg_variable ]
+!18 = metadata !{metadata !"0x101\00ptr1\0083886091\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x101\00ptr2\00100663307\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
 !20 = metadata !{i32 11, i32 24, metadata !1, null}
 !21 = metadata !{i32 11, i32 44, metadata !1, null}
 !22 = metadata !{i32 11, i32 54, metadata !1, null}
@@ -59,10 +59,10 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !24 = metadata !{i32 11, i32 81, metadata !1, null}
 !25 = metadata !{i32 11, i32 101, metadata !1, null}
 !26 = metadata !{i32 12, i32 3, metadata !27, null}
-!27 = metadata !{i32 786443, metadata !2, metadata !1, i32 11, i32 107, i32 0} ; [ DW_TAG_lexical_block ]
+!27 = metadata !{metadata !"0xb\0011\00107\000", metadata !2, metadata !1} ; [ DW_TAG_lexical_block ]
 !28 = metadata !{i32 13, i32 5, metadata !27, null}
 !29 = metadata !{i32 14, i32 1, metadata !27, null}
 !30 = metadata !{metadata !1}
 !31 = metadata !{metadata !5, metadata !13, metadata !14, metadata !17, metadata !18, metadata!19}
 !32 = metadata !{metadata !"one.c", metadata !"/Volumes/Athwagate/R10048772"}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/debug-info-blocks.ll b/test/CodeGen/ARM/debug-info-blocks.ll
index 5ad5e59..3623927 100644
--- a/test/CodeGen/ARM/debug-info-blocks.ll
+++ b/test/CodeGen/ARM/debug-info-blocks.ll
@@ -19,11 +19,11 @@ target triple = "thumbv7-apple-ios"
 @"OBJC_IVAR_$_MyWork._data" = external hidden global i32, section "__DATA, __objc_const", align 4
 @"\01L_OBJC_SELECTOR_REFERENCES_222" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i8* @objc_msgSend(i8*, i8*, ...)
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
 
@@ -31,22 +31,22 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
   %1 = alloca %0*, align 4
   %bounds = alloca %struct.CR, align 4
   %data = alloca %struct.CR, align 4
-  call void @llvm.dbg.value(metadata !{i8* %.block_descriptor}, i64 0, metadata !27), !dbg !129
+  call void @llvm.dbg.value(metadata !{i8* %.block_descriptor}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !129
   store %0* %loadedMydata, %0** %1, align 4
-  call void @llvm.dbg.declare(metadata !{%0** %1}, metadata !130), !dbg !131
+  call void @llvm.dbg.declare(metadata !{%0** %1}, metadata !130, metadata !{metadata !"0x102"}), !dbg !131
   %2 = bitcast %struct.CR* %bounds to %1*
   %3 = getelementptr %1* %2, i32 0, i32 0
   store [4 x i32] %bounds.coerce0, [4 x i32]* %3
-  call void @llvm.dbg.declare(metadata !{%struct.CR* %bounds}, metadata !132), !dbg !133
+  call void @llvm.dbg.declare(metadata !{%struct.CR* %bounds}, metadata !132, metadata !{metadata !"0x102"}), !dbg !133
   %4 = bitcast %struct.CR* %data to %1*
   %5 = getelementptr %1* %4, i32 0, i32 0
   store [4 x i32] %data.coerce0, [4 x i32]* %5
-  call void @llvm.dbg.declare(metadata !{%struct.CR* %data}, metadata !134), !dbg !135
+  call void @llvm.dbg.declare(metadata !{%struct.CR* %data}, metadata !134, metadata !{metadata !"0x102"}), !dbg !135
   %6 = bitcast i8* %.block_descriptor to %2*
   %7 = getelementptr inbounds %2* %6, i32 0, i32 6
-  call void @llvm.dbg.declare(metadata !{%2* %6}, metadata !136), !dbg !137
-  call void @llvm.dbg.declare(metadata !{%2* %6}, metadata !138), !dbg !137
-  call void @llvm.dbg.declare(metadata !{%2* %6}, metadata !139), !dbg !140
+  call void @llvm.dbg.declare(metadata !{%2* %6}, metadata !136, metadata !163), !dbg !137
+  call void @llvm.dbg.declare(metadata !{%2* %6}, metadata !138, metadata !164), !dbg !137
+  call void @llvm.dbg.declare(metadata !{%2* %6}, metadata !139, metadata !165), !dbg !140
   %8 = load %0** %1, align 4, !dbg !141
   %9 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_13", !dbg !141
   %10 = bitcast %0* %8 to i8*, !dbg !141
@@ -95,149 +95,149 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!162}
 
-!0 = metadata !{i32 786449, metadata !153, i32 16, metadata !"Apple clang version 2.1", i1 false, metadata !"", i32 2, metadata !147, metadata !26, metadata !148, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786436, metadata !160, metadata !0, metadata !"", i32 248, i64 32, i64 32, i32 0, i32 0, null, metadata !3, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 248, size 32, align 32, offset 0] [def] [from ]
-!2 = metadata !{i32 786473, metadata !160} ; [ DW_TAG_file_type ]
+!0 = metadata !{metadata !"0x11\0016\00Apple clang version 2.1\000\00\002\00\001", metadata !153, metadata !147, metadata !26, metadata !148, null, null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x4\00\00248\0032\0032\000\000\000", metadata !160, metadata !0, null, metadata !3, null, null, null} ; [ DW_TAG_enumeration_type ] [line 248, size 32, align 32, offset 0] [def] [from ]
+!2 = metadata !{metadata !"0x29", metadata !160} ; [ DW_TAG_file_type ]
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786472, metadata !"Ver1", i64 0} ; [ DW_TAG_enumerator ]
-!5 = metadata !{i32 786436, metadata !160, metadata !0, metadata !"Mode", i32 79, i64 32, i64 32, i32 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [Mode] [line 79, size 32, align 32, offset 0] [def] [from ]
-!6 = metadata !{i32 786473, metadata !161} ; [ DW_TAG_file_type ]
+!4 = metadata !{metadata !"0x28\00Ver1\000"} ; [ DW_TAG_enumerator ]
+!5 = metadata !{metadata !"0x4\00Mode\0079\0032\0032\000\000\000", metadata !160, metadata !0, null, metadata !7, null, null, null} ; [ DW_TAG_enumeration_type ] [Mode] [line 79, size 32, align 32, offset 0] [def] [from ]
+!6 = metadata !{metadata !"0x29", metadata !161} ; [ DW_TAG_file_type ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786472, metadata !"One", i64 0} ; [ DW_TAG_enumerator ]
-!9 = metadata !{i32 786436, metadata !149, metadata !0, metadata !"", i32 15, i64 32, i64 32, i32 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 15, size 32, align 32, offset 0] [def] [from ]
-!10 = metadata !{i32 786473, metadata !149} ; [ DW_TAG_file_type ]
+!8 = metadata !{metadata !"0x28\00One\000"} ; [ DW_TAG_enumerator ]
+!9 = metadata !{metadata !"0x4\00\0015\0032\0032\000\000\000", metadata !149, metadata !0, null, metadata !11, null, null, null} ; [ DW_TAG_enumeration_type ] [line 15, size 32, align 32, offset 0] [def] [from ]
+!10 = metadata !{metadata !"0x29", metadata !149} ; [ DW_TAG_file_type ]
 !11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{i32 786472, metadata !"Unknown", i64 0} ; [ DW_TAG_enumerator ]
-!13 = metadata !{i32 786472, metadata !"Known", i64 1} ; [ DW_TAG_enumerator ]
-!14 = metadata !{i32 786436, metadata !150, metadata !0, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
-!15 = metadata !{i32 786473, metadata !150} ; [ DW_TAG_file_type ]
+!12 = metadata !{metadata !"0x28\00Unknown\000"} ; [ DW_TAG_enumerator ]
+!13 = metadata !{metadata !"0x28\00Known\001"} ; [ DW_TAG_enumerator ]
+!14 = metadata !{metadata !"0x4\00\0020\0032\0032\000\000\000", metadata !150, metadata !0, null, metadata !16, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
+!15 = metadata !{metadata !"0x29", metadata !150} ; [ DW_TAG_file_type ]
 !16 = metadata !{metadata !17, metadata !18}
-!17 = metadata !{i32 786472, metadata !"Single", i64 0} ; [ DW_TAG_enumerator ]
-!18 = metadata !{i32 786472, metadata !"Double", i64 1} ; [ DW_TAG_enumerator ]
-!19 = metadata !{i32 786436, metadata !151, metadata !0, metadata !"", i32 14, i64 32, i64 32, i32 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 14, size 32, align 32, offset 0] [def] [from ]
-!20 = metadata !{i32 786473, metadata !151} ; [ DW_TAG_file_type ]
+!17 = metadata !{metadata !"0x28\00Single\000"} ; [ DW_TAG_enumerator ]
+!18 = metadata !{metadata !"0x28\00Double\001"} ; [ DW_TAG_enumerator ]
+!19 = metadata !{metadata !"0x4\00\0014\0032\0032\000\000\000", metadata !151, metadata !0, null, metadata !21, null, null, null} ; [ DW_TAG_enumeration_type ] [line 14, size 32, align 32, offset 0] [def] [from ]
+!20 = metadata !{metadata !"0x29", metadata !151} ; [ DW_TAG_file_type ]
 !21 = metadata !{metadata !22}
-!22 = metadata !{i32 786472, metadata !"Eleven", i64 0} ; [ DW_TAG_enumerator ]
-!23 = metadata !{i32 786478, metadata !152, metadata !24, metadata !"foobar_func_block_invoke_0", metadata !"foobar_func_block_invoke_0", metadata !"", i32 609, metadata !25, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*, %0*, [4 x i32], [4 x i32])* @foobar_func_block_invoke_0, null, null, null, i32 609} ; [ DW_TAG_subprogram ] [line 609] [local] [def] [foobar_func_block_invoke_0]
-!24 = metadata !{i32 786473, metadata !152} ; [ DW_TAG_file_type ]
-!25 = metadata !{i32 786453, metadata !152, metadata !24, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !26, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = metadata !{metadata !"0x28\00Eleven\000"} ; [ DW_TAG_enumerator ]
+!23 = metadata !{metadata !"0x2e\00foobar_func_block_invoke_0\00foobar_func_block_invoke_0\00\00609\001\001\000\006\00256\000\00609", metadata !152, metadata !24, metadata !25, null, void (i8*, %0*, [4 x i32], [4 x i32])* @foobar_func_block_invoke_0, null, null, null} ; [ DW_TAG_subprogram ] [line 609] [local] [def] [foobar_func_block_invoke_0]
+!24 = metadata !{metadata !"0x29", metadata !152} ; [ DW_TAG_file_type ]
+!25 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !152, metadata !24, null, metadata !26, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !26 = metadata !{null}
-!27 = metadata !{i32 786689, metadata !23, metadata !".block_descriptor", metadata !24, i32 16777825, metadata !28, i32 64, null} ; [ DW_TAG_arg_variable ]
-!28 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 0, i64 0, i32 0, metadata !29} ; [ DW_TAG_pointer_type ]
-!29 = metadata !{i32 786451, metadata !152, metadata !24, metadata !"__block_literal_14", i32 609, i64 256, i64 32, i32 0, i32 0, null, metadata !30, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_14] [line 609, size 256, align 32, offset 0] [def] [from ]
+!27 = metadata !{metadata !"0x101\00.block_descriptor\0016777825\0064", metadata !23, metadata !24, metadata !28} ; [ DW_TAG_arg_variable ]
+!28 = metadata !{metadata !"0xf\00\000\0032\000\000\000", null, metadata !0, metadata !29} ; [ DW_TAG_pointer_type ]
+!29 = metadata !{metadata !"0x13\00__block_literal_14\00609\00256\0032\000\000\000", metadata !152, metadata !24, null, metadata !30, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_14] [line 609, size 256, align 32, offset 0] [def] [from ]
 !30 = metadata !{metadata !31, metadata !33, metadata !35, metadata !36, metadata !37, metadata !48, metadata !89, metadata !124}
-!31 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__isa", i32 609, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_member ]
-!32 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!33 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__flags", i32 609, i64 32, i64 32, i64 32, i32 0, metadata !34} ; [ DW_TAG_member ]
-!34 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!35 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__reserved", i32 609, i64 32, i64 32, i64 64, i32 0, metadata !34} ; [ DW_TAG_member ]
-!36 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__FuncPtr", i32 609, i64 32, i64 32, i64 96, i32 0, metadata !32} ; [ DW_TAG_member ]
-!37 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__descriptor", i32 609, i64 32, i64 32, i64 128, i32 0, metadata !38} ; [ DW_TAG_member ]
-!38 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !39} ; [ DW_TAG_pointer_type ]
-!39 = metadata !{i32 786451, metadata !153, metadata !0, metadata !"__block_descriptor_withcopydispose", i32 307, i64 128, i64 32, i32 0, i32 0, null, metadata !41, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 307, size 128, align 32, offset 0] [def] [from ]
-!40 = metadata !{i32 786473, metadata !153} ; [ DW_TAG_file_type ]
+!31 = metadata !{metadata !"0xd\00__isa\00609\0032\0032\000\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
+!32 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, null} ; [ DW_TAG_pointer_type ]
+!33 = metadata !{metadata !"0xd\00__flags\00609\0032\0032\0032\000", metadata !152, metadata !24, metadata !34} ; [ DW_TAG_member ]
+!34 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
+!35 = metadata !{metadata !"0xd\00__reserved\00609\0032\0032\0064\000", metadata !152, metadata !24, metadata !34} ; [ DW_TAG_member ]
+!36 = metadata !{metadata !"0xd\00__FuncPtr\00609\0032\0032\0096\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
+!37 = metadata !{metadata !"0xd\00__descriptor\00609\0032\0032\00128\000", metadata !152, metadata !24, metadata !38} ; [ DW_TAG_member ]
+!38 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !39} ; [ DW_TAG_pointer_type ]
+!39 = metadata !{metadata !"0x13\00__block_descriptor_withcopydispose\00307\00128\0032\000\000\000", metadata !153, metadata !0, null, metadata !41, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 307, size 128, align 32, offset 0] [def] [from ]
+!40 = metadata !{metadata !"0x29", metadata !153} ; [ DW_TAG_file_type ]
 !41 = metadata !{metadata !42, metadata !44, metadata !45, metadata !47}
-!42 = metadata !{i32 786445, metadata !153, metadata !40, metadata !"reserved", i32 307, i64 32, i64 32, i64 0, i32 0, metadata !43} ; [ DW_TAG_member ]
-!43 = metadata !{i32 786468, null, metadata !0, metadata !"long unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!44 = metadata !{i32 786445, metadata !153, metadata !40, metadata !"Size", i32 307, i64 32, i64 32, i64 32, i32 0, metadata !43} ; [ DW_TAG_member ]
-!45 = metadata !{i32 786445, metadata !153, metadata !40, metadata !"CopyFuncPtr", i32 307, i64 32, i64 32, i64 64, i32 0, metadata !46} ; [ DW_TAG_member ]
-!46 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_pointer_type ]
-!47 = metadata !{i32 786445, metadata !153, metadata !40, metadata !"DestroyFuncPtr", i32 307, i64 32, i64 32, i64 96, i32 0, metadata !46} ; [ DW_TAG_member ]
-!48 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"mydata", i32 609, i64 32, i64 32, i64 160, i32 0, metadata !49} ; [ DW_TAG_member ]
-!49 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 0, i64 0, i32 0, metadata !50} ; [ DW_TAG_pointer_type ]
-!50 = metadata !{i32 786451, metadata !152, metadata !24, metadata !"", i32 0, i64 224, i64 0, i32 0, i32 16, null, metadata !51, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [line 0, size 224, align 0, offset 0] [def] [from ]
+!42 = metadata !{metadata !"0xd\00reserved\00307\0032\0032\000\000", metadata !153, metadata !40, metadata !43} ; [ DW_TAG_member ]
+!43 = metadata !{metadata !"0x24\00long unsigned int\000\0032\0032\000\000\007", null, metadata !0} ; [ DW_TAG_base_type ]
+!44 = metadata !{metadata !"0xd\00Size\00307\0032\0032\0032\000", metadata !153, metadata !40, metadata !43} ; [ DW_TAG_member ]
+!45 = metadata !{metadata !"0xd\00CopyFuncPtr\00307\0032\0032\0064\000", metadata !153, metadata !40, metadata !46} ; [ DW_TAG_member ]
+!46 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !32} ; [ DW_TAG_pointer_type ]
+!47 = metadata !{metadata !"0xd\00DestroyFuncPtr\00307\0032\0032\0096\000", metadata !153, metadata !40, metadata !46} ; [ DW_TAG_member ]
+!48 = metadata !{metadata !"0xd\00mydata\00609\0032\0032\00160\000", metadata !152, metadata !24, metadata !49} ; [ DW_TAG_member ]
+!49 = metadata !{metadata !"0xf\00\000\0032\000\000\000", null, metadata !0, metadata !50} ; [ DW_TAG_pointer_type ]
+!50 = metadata !{metadata !"0x13\00\000\00224\000\000\0016\000", metadata !152, metadata !24, null, metadata !51, null, null, null} ; [ DW_TAG_structure_type ] [line 0, size 224, align 0, offset 0] [def] [from ]
 !51 = metadata !{metadata !52, metadata !53, metadata !54, metadata !55, metadata !56, metadata !57, metadata !58}
-!52 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__isa", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_member ]
-!53 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__forwarding", i32 0, i64 32, i64 32, i64 32, i32 0, metadata !32} ; [ DW_TAG_member ]
-!54 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__flags", i32 0, i64 32, i64 32, i64 64, i32 0, metadata !34} ; [ DW_TAG_member ]
-!55 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__size", i32 0, i64 32, i64 32, i64 96, i32 0, metadata !34} ; [ DW_TAG_member ]
-!56 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__copy_helper", i32 0, i64 32, i64 32, i64 128, i32 0, metadata !32} ; [ DW_TAG_member ]
-!57 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__destroy_helper", i32 0, i64 32, i64 32, i64 160, i32 0, metadata !32} ; [ DW_TAG_member ]
-!58 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"mydata", i32 0, i64 32, i64 32, i64 192, i32 0, metadata !59} ; [ DW_TAG_member ]
-!59 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !60} ; [ DW_TAG_pointer_type ]
-!60 = metadata !{i32 786451, metadata !154, metadata !24, metadata !"UIMydata", i32 26, i64 128, i64 32, i32 0, i32 0, null, metadata !62, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [UIMydata] [line 26, size 128, align 32, offset 0] [def] [from ]
-!61 = metadata !{i32 786473, metadata !154} ; [ DW_TAG_file_type ]
+!52 = metadata !{metadata !"0xd\00__isa\000\0032\0032\000\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
+!53 = metadata !{metadata !"0xd\00__forwarding\000\0032\0032\0032\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
+!54 = metadata !{metadata !"0xd\00__flags\000\0032\0032\0064\000", metadata !152, metadata !24, metadata !34} ; [ DW_TAG_member ]
+!55 = metadata !{metadata !"0xd\00__size\000\0032\0032\0096\000", metadata !152, metadata !24, metadata !34} ; [ DW_TAG_member ]
+!56 = metadata !{metadata !"0xd\00__copy_helper\000\0032\0032\00128\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
+!57 = metadata !{metadata !"0xd\00__destroy_helper\000\0032\0032\00160\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
+!58 = metadata !{metadata !"0xd\00mydata\000\0032\0032\00192\000", metadata !152, metadata !24, metadata !59} ; [ DW_TAG_member ]
+!59 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !60} ; [ DW_TAG_pointer_type ]
+!60 = metadata !{metadata !"0x13\00UIMydata\0026\00128\0032\000\000\0016", metadata !154, metadata !24, null, metadata !62, null, null, null} ; [ DW_TAG_structure_type ] [UIMydata] [line 26, size 128, align 32, offset 0] [def] [from ]
+!61 = metadata !{metadata !"0x29", metadata !154} ; [ DW_TAG_file_type ]
 !62 = metadata !{metadata !63, metadata !71, metadata !75, metadata !79}
-!63 = metadata !{i32 786460, metadata !60, null, metadata !61, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !64} ; [ DW_TAG_inheritance ]
-!64 = metadata !{i32 786451, metadata !155, metadata !40, metadata !"NSO", i32 66, i64 32, i64 32, i32 0, i32 0, null, metadata !66, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [NSO] [line 66, size 32, align 32, offset 0] [def] [from ]
-!65 = metadata !{i32 786473, metadata !155} ; [ DW_TAG_file_type ]
+!63 = metadata !{metadata !"0x1c\00\000\000\000\000\000", metadata !60, null, metadata !64} ; [ DW_TAG_inheritance ]
+!64 = metadata !{metadata !"0x13\00NSO\0066\0032\0032\000\000\0016", metadata !155, metadata !40, null, metadata !66, null, null, null} ; [ DW_TAG_structure_type ] [NSO] [line 66, size 32, align 32, offset 0] [def] [from ]
+!65 = metadata !{metadata !"0x29", metadata !155} ; [ DW_TAG_file_type ]
 !66 = metadata !{metadata !67}
-!67 = metadata !{i32 786445, metadata !155, metadata !65, metadata !"isa", i32 67, i64 32, i64 32, i64 0, i32 2, metadata !68, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!68 = metadata !{i32 786454, metadata !153, metadata !0, metadata !"Class", i32 197, i64 0, i64 0, i64 0, i32 0, metadata !69} ; [ DW_TAG_typedef ]
-!69 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !70} ; [ DW_TAG_pointer_type ]
-!70 = metadata !{i32 786451, metadata !153, metadata !0, metadata !"objc_class", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
-!71 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"_mydataRef", i32 28, i64 32, i64 32, i64 32, i32 0, metadata !72, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!72 = metadata !{i32 786454, metadata !152, metadata !0, metadata !"CFTypeRef", i32 313, i64 0, i64 0, i64 0, i32 0, metadata !73} ; [ DW_TAG_typedef ]
-!73 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !74} ; [ DW_TAG_pointer_type ]
-!74 = metadata !{i32 786470, null, metadata !0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_const_type ]
-!75 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"_scale", i32 29, i64 32, i64 32, i64 64, i32 0, metadata !76, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!76 = metadata !{i32 786454, metadata !156, metadata !0, metadata !"Float", i32 89, i64 0, i64 0, i64 0, i32 0, metadata !78} ; [ DW_TAG_typedef ]
-!77 = metadata !{i32 786473, metadata !156} ; [ DW_TAG_file_type ]
-!78 = metadata !{i32 786468, null, metadata !0, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!79 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"_mydataFlags", i32 37, i64 8, i64 8, i64 96, i32 0, metadata !80, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!80 = metadata !{i32 786451, metadata !154, metadata !0, metadata !"", i32 30, i64 8, i64 8, i32 0, i32 0, null, metadata !81, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [line 30, size 8, align 8, offset 0] [def] [from ]
+!67 = metadata !{metadata !"0xd\00isa\0067\0032\0032\000\002", metadata !155, metadata !65, metadata !68, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
+!68 = metadata !{metadata !"0x16\00Class\00197\000\000\000\000", metadata !153, metadata !0, metadata !69} ; [ DW_TAG_typedef ]
+!69 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !70} ; [ DW_TAG_pointer_type ]
+!70 = metadata !{metadata !"0x13\00objc_class\000\000\000\000\004\000", metadata !153, metadata !0, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!71 = metadata !{metadata !"0xd\00_mydataRef\0028\0032\0032\0032\000", metadata !154, metadata !61, metadata !72, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
+!72 = metadata !{metadata !"0x16\00CFTypeRef\00313\000\000\000\000", metadata !152, metadata !0, metadata !73} ; [ DW_TAG_typedef ]
+!73 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !74} ; [ DW_TAG_pointer_type ]
+!74 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, metadata !0, null} ; [ DW_TAG_const_type ]
+!75 = metadata !{metadata !"0xd\00_scale\0029\0032\0032\0064\000", metadata !154, metadata !61, metadata !76, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
+!76 = metadata !{metadata !"0x16\00Float\0089\000\000\000\000", metadata !156, metadata !0, metadata !78} ; [ DW_TAG_typedef ]
+!77 = metadata !{metadata !"0x29", metadata !156} ; [ DW_TAG_file_type ]
+!78 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !0} ; [ DW_TAG_base_type ]
+!79 = metadata !{metadata !"0xd\00_mydataFlags\0037\008\008\0096\000", metadata !154, metadata !61, metadata !80, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
+!80 = metadata !{metadata !"0x13\00\0030\008\008\000\000\000", metadata !154, metadata !0, null, metadata !81, null, null, null} ; [ DW_TAG_structure_type ] [line 30, size 8, align 8, offset 0] [def] [from ]
 !81 = metadata !{metadata !82, metadata !84, metadata !85, metadata !86, metadata !87, metadata !88}
-!82 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"named", i32 31, i64 1, i64 32, i64 0, i32 0, metadata !83} ; [ DW_TAG_member ]
-!83 = metadata !{i32 786468, null, metadata !0, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!84 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"mydataO", i32 32, i64 3, i64 32, i64 1, i32 0, metadata !83} ; [ DW_TAG_member ]
-!85 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"cached", i32 33, i64 1, i64 32, i64 4, i32 0, metadata !83} ; [ DW_TAG_member ]
-!86 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"hasBeenCached", i32 34, i64 1, i64 32, i64 5, i32 0, metadata !83} ; [ DW_TAG_member ]
-!87 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"hasPattern", i32 35, i64 1, i64 32, i64 6, i32 0, metadata !83} ; [ DW_TAG_member ]
-!88 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"isCIMydata", i32 36, i64 1, i64 32, i64 7, i32 0, metadata !83} ; [ DW_TAG_member ]
-!89 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"self", i32 609, i64 32, i64 32, i64 192, i32 0, metadata !90} ; [ DW_TAG_member ]
-!90 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !91} ; [ DW_TAG_pointer_type ]
-!91 = metadata !{i32 786451, metadata !152, metadata !40, metadata !"MyWork", i32 36, i64 384, i64 32, i32 0, i32 0, null, metadata !92, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [MyWork] [line 36, size 384, align 32, offset 0] [def] [from ]
+!82 = metadata !{metadata !"0xd\00named\0031\001\0032\000\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
+!83 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, metadata !0} ; [ DW_TAG_base_type ]
+!84 = metadata !{metadata !"0xd\00mydataO\0032\003\0032\001\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
+!85 = metadata !{metadata !"0xd\00cached\0033\001\0032\004\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
+!86 = metadata !{metadata !"0xd\00hasBeenCached\0034\001\0032\005\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
+!87 = metadata !{metadata !"0xd\00hasPattern\0035\001\0032\006\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
+!88 = metadata !{metadata !"0xd\00isCIMydata\0036\001\0032\007\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
+!89 = metadata !{metadata !"0xd\00self\00609\0032\0032\00192\000", metadata !152, metadata !24, metadata !90} ; [ DW_TAG_member ]
+!90 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !91} ; [ DW_TAG_pointer_type ]
+!91 = metadata !{metadata !"0x13\00MyWork\0036\00384\0032\000\000\0016", metadata !152, metadata !40, null, metadata !92, null, null, null} ; [ DW_TAG_structure_type ] [MyWork] [line 36, size 384, align 32, offset 0] [def] [from ]
 !92 = metadata !{metadata !93, metadata !98, metadata !101, metadata !107, metadata !123}
-!93 = metadata !{i32 786460, metadata !152, metadata !91, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !94} ; [ DW_TAG_inheritance ]
-!94 = metadata !{i32 786451, metadata !157, metadata !40, metadata !"twork", i32 43, i64 32, i64 32, i32 0, i32 0, null, metadata !96, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [twork] [line 43, size 32, align 32, offset 0] [def] [from ]
-!95 = metadata !{i32 786473, metadata !157} ; [ DW_TAG_file_type ]
+!93 = metadata !{metadata !"0x1c\00\000\000\000\000\000", metadata !152, metadata !91, metadata !94} ; [ DW_TAG_inheritance ]
+!94 = metadata !{metadata !"0x13\00twork\0043\0032\0032\000\000\0016", metadata !157, metadata !40, null, metadata !96, null, null, null} ; [ DW_TAG_structure_type ] [twork] [line 43, size 32, align 32, offset 0] [def] [from ]
+!95 = metadata !{metadata !"0x29", metadata !157} ; [ DW_TAG_file_type ]
 !96 = metadata !{metadata !97}
-!97 = metadata !{i32 786460, metadata !94, null, metadata !95, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !64} ; [ DW_TAG_inheritance ]
-!98 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"_itemID", i32 38, i64 64, i64 32, i64 32, i32 1, metadata !99, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!99 = metadata !{i32 786454, metadata !153, metadata !0, metadata !"uint64_t", i32 55, i64 0, i64 0, i64 0, i32 0, metadata !100} ; [ DW_TAG_typedef ]
-!100 = metadata !{i32 786468, null, metadata !0, metadata !"long long unsigned int", i32 0, i64 64, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!101 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"_library", i32 39, i64 32, i64 32, i64 96, i32 1, metadata !102, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!102 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !103} ; [ DW_TAG_pointer_type ]
-!103 = metadata !{i32 786451, metadata !158, metadata !40, metadata !"MyLibrary2", i32 22, i64 32, i64 32, i32 0, i32 0, null, metadata !105, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [MyLibrary2] [line 22, size 32, align 32, offset 0] [def] [from ]
-!104 = metadata !{i32 786473, metadata !158} ; [ DW_TAG_file_type ]
+!97 = metadata !{metadata !"0x1c\00\000\000\000\000\000", metadata !94, null, metadata !64} ; [ DW_TAG_inheritance ]
+!98 = metadata !{metadata !"0xd\00_itemID\0038\0064\0032\0032\001", metadata !152, metadata !24, metadata !99, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
+!99 = metadata !{metadata !"0x16\00uint64_t\0055\000\000\000\000", metadata !153, metadata !0, metadata !100} ; [ DW_TAG_typedef ]
+!100 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0032\000\000\007", null, metadata !0} ; [ DW_TAG_base_type ]
+!101 = metadata !{metadata !"0xd\00_library\0039\0032\0032\0096\001", metadata !152, metadata !24, metadata !102, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
+!102 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !103} ; [ DW_TAG_pointer_type ]
+!103 = metadata !{metadata !"0x13\00MyLibrary2\0022\0032\0032\000\000\0016", metadata !158, metadata !40, null, metadata !105, null, null, null} ; [ DW_TAG_structure_type ] [MyLibrary2] [line 22, size 32, align 32, offset 0] [def] [from ]
+!104 = metadata !{metadata !"0x29", metadata !158} ; [ DW_TAG_file_type ]
 !105 = metadata !{metadata !106}
-!106 = metadata !{i32 786460, metadata !103, null, metadata !104, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !64} ; [ DW_TAG_inheritance ]
-!107 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"_bounds", i32 40, i64 128, i64 32, i64 128, i32 1, metadata !108, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!108 = metadata !{i32 786454, metadata !153, metadata !0, metadata !"CR", i32 33, i64 0, i64 0, i64 0, i32 0, metadata !109} ; [ DW_TAG_typedef ]
-!109 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"CR", i32 29, i64 128, i64 32, i32 0, i32 0, null, metadata !110, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [CR] [line 29, size 128, align 32, offset 0] [def] [from ]
+!106 = metadata !{metadata !"0x1c\00\000\000\000\000\000", metadata !103, null, metadata !64} ; [ DW_TAG_inheritance ]
+!107 = metadata !{metadata !"0xd\00_bounds\0040\00128\0032\00128\001", metadata !152, metadata !24, metadata !108, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
+!108 = metadata !{metadata !"0x16\00CR\0033\000\000\000\000", metadata !153, metadata !0, metadata !109} ; [ DW_TAG_typedef ]
+!109 = metadata !{metadata !"0x13\00CR\0029\00128\0032\000\000\000", metadata !156, metadata !0, null, metadata !110, null, null, null} ; [ DW_TAG_structure_type ] [CR] [line 29, size 128, align 32, offset 0] [def] [from ]
 !110 = metadata !{metadata !111, metadata !117}
-!111 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"origin", i32 30, i64 64, i64 32, i64 0, i32 0, metadata !112} ; [ DW_TAG_member ]
-!112 = metadata !{i32 786454, metadata !156, metadata !0, metadata !"CP", i32 17, i64 0, i64 0, i64 0, i32 0, metadata !113} ; [ DW_TAG_typedef ]
-!113 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"CP", i32 13, i64 64, i64 32, i32 0, i32 0, null, metadata !114, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [CP] [line 13, size 64, align 32, offset 0] [def] [from ]
+!111 = metadata !{metadata !"0xd\00origin\0030\0064\0032\000\000", metadata !156, metadata !77, metadata !112} ; [ DW_TAG_member ]
+!112 = metadata !{metadata !"0x16\00CP\0017\000\000\000\000", metadata !156, metadata !0, metadata !113} ; [ DW_TAG_typedef ]
+!113 = metadata !{metadata !"0x13\00CP\0013\0064\0032\000\000\000", metadata !156, metadata !0, null, metadata !114, null, null, null} ; [ DW_TAG_structure_type ] [CP] [line 13, size 64, align 32, offset 0] [def] [from ]
 !114 = metadata !{metadata !115, metadata !116}
-!115 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"x", i32 14, i64 32, i64 32, i64 0, i32 0, metadata !76} ; [ DW_TAG_member ]
-!116 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"y", i32 15, i64 32, i64 32, i64 32, i32 0, metadata !76} ; [ DW_TAG_member ]
-!117 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"size", i32 31, i64 64, i64 32, i64 64, i32 0, metadata !118} ; [ DW_TAG_member ]
-!118 = metadata !{i32 786454, metadata !156, metadata !0, metadata !"Size", i32 25, i64 0, i64 0, i64 0, i32 0, metadata !119} ; [ DW_TAG_typedef ]
-!119 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"Size", i32 21, i64 64, i64 32, i32 0, i32 0, null, metadata !120, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Size] [line 21, size 64, align 32, offset 0] [def] [from ]
+!115 = metadata !{metadata !"0xd\00x\0014\0032\0032\000\000", metadata !156, metadata !77, metadata !76} ; [ DW_TAG_member ]
+!116 = metadata !{metadata !"0xd\00y\0015\0032\0032\0032\000", metadata !156, metadata !77, metadata !76} ; [ DW_TAG_member ]
+!117 = metadata !{metadata !"0xd\00size\0031\0064\0032\0064\000", metadata !156, metadata !77, metadata !118} ; [ DW_TAG_member ]
+!118 = metadata !{metadata !"0x16\00Size\0025\000\000\000\000", metadata !156, metadata !0, metadata !119} ; [ DW_TAG_typedef ]
+!119 = metadata !{metadata !"0x13\00Size\0021\0064\0032\000\000\000", metadata !156, metadata !0, null, metadata !120, null, null, null} ; [ DW_TAG_structure_type ] [Size] [line 21, size 64, align 32, offset 0] [def] [from ]
 !120 = metadata !{metadata !121, metadata !122}
-!121 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"width", i32 22, i64 32, i64 32, i64 0, i32 0, metadata !76} ; [ DW_TAG_member ]
-!122 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"height", i32 23, i64 32, i64 32, i64 32, i32 0, metadata !76} ; [ DW_TAG_member ]
-!123 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"_data", i32 40, i64 128, i64 32, i64 256, i32 1, metadata !108, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!124 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"semi", i32 609, i64 32, i64 32, i64 224, i32 0, metadata !125} ; [ DW_TAG_member ]
-!125 = metadata !{i32 786454, metadata !152, metadata !0, metadata !"d_t", i32 35, i64 0, i64 0, i64 0, i32 0, metadata !126} ; [ DW_TAG_typedef ]
-!126 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !127} ; [ DW_TAG_pointer_type ]
-!127 = metadata !{i32 786451, metadata !159, metadata !0, metadata !"my_struct", i32 49, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [my_struct] [line 49, size 0, align 0, offset 0] [decl] [from ]
-!128 = metadata !{i32 786473, metadata !159} ; [ DW_TAG_file_type ]
+!121 = metadata !{metadata !"0xd\00width\0022\0032\0032\000\000", metadata !156, metadata !77, metadata !76} ; [ DW_TAG_member ]
+!122 = metadata !{metadata !"0xd\00height\0023\0032\0032\0032\000", metadata !156, metadata !77, metadata !76} ; [ DW_TAG_member ]
+!123 = metadata !{metadata !"0xd\00_data\0040\00128\0032\00256\001", metadata !152, metadata !24, metadata !108, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
+!124 = metadata !{metadata !"0xd\00semi\00609\0032\0032\00224\000", metadata !152, metadata !24, metadata !125} ; [ DW_TAG_member ]
+!125 = metadata !{metadata !"0x16\00d_t\0035\000\000\000\000", metadata !152, metadata !0, metadata !126} ; [ DW_TAG_typedef ]
+!126 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !127} ; [ DW_TAG_pointer_type ]
+!127 = metadata !{metadata !"0x13\00my_struct\0049\000\000\000\004\000", metadata !159, metadata !0, null, null, null, null, null} ; [ DW_TAG_structure_type ] [my_struct] [line 49, size 0, align 0, offset 0] [decl] [from ]
+!128 = metadata !{metadata !"0x29", metadata !159} ; [ DW_TAG_file_type ]
 !129 = metadata !{i32 609, i32 144, metadata !23, null}
-!130 = metadata !{i32 786689, metadata !23, metadata !"loadedMydata", metadata !24, i32 33555041, metadata !59, i32 0, null} ; [ DW_TAG_arg_variable ]
+!130 = metadata !{metadata !"0x101\00loadedMydata\0033555041\000", metadata !23, metadata !24, metadata !59} ; [ DW_TAG_arg_variable ]
 !131 = metadata !{i32 609, i32 155, metadata !23, null}
-!132 = metadata !{i32 786689, metadata !23, metadata !"bounds", metadata !24, i32 50332257, metadata !108, i32 0, null} ; [ DW_TAG_arg_variable ]
+!132 = metadata !{metadata !"0x101\00bounds\0050332257\000", metadata !23, metadata !24, metadata !108} ; [ DW_TAG_arg_variable ]
 !133 = metadata !{i32 609, i32 175, metadata !23, null}
-!134 = metadata !{i32 786689, metadata !23, metadata !"data", metadata !24, i32 67109473, metadata !108, i32 0, null} ; [ DW_TAG_arg_variable ]
+!134 = metadata !{metadata !"0x101\00data\0067109473\000", metadata !23, metadata !24, metadata !108} ; [ DW_TAG_arg_variable ]
 !135 = metadata !{i32 609, i32 190, metadata !23, null}
-!136 = metadata !{i32 786688, metadata !23, metadata !"mydata", metadata !24, i32 604, metadata !50, i32 0, null, metadata !163} ; [ DW_TAG_auto_variable ]
+!136 = metadata !{metadata !"0x100\00mydata\00604\000", metadata !23, metadata !24, metadata !50} ; [ DW_TAG_auto_variable ]
 !137 = metadata !{i32 604, i32 49, metadata !23, null}
-!138 = metadata !{i32 786688, metadata !23, metadata !"self", metadata !40, i32 604, metadata !90, i32 0, null, metadata !164} ; [ DW_TAG_auto_variable ]
-!139 = metadata !{i32 786688, metadata !23, metadata !"semi", metadata !24, i32 607, metadata !125, i32 0, null, metadata !165} ; [ DW_TAG_auto_variable ]
+!138 = metadata !{metadata !"0x100\00self\00604\000", metadata !23, metadata !40, metadata !90} ; [ DW_TAG_auto_variable ]
+!139 = metadata !{metadata !"0x100\00semi\00607\000", metadata !23, metadata !24, metadata !125} ; [ DW_TAG_auto_variable ]
 !140 = metadata !{i32 607, i32 30, metadata !23, null}
 !141 = metadata !{i32 610, i32 17, metadata !142, null}
-!142 = metadata !{i32 786443, metadata !152, metadata !23, i32 609, i32 200, i32 94} ; [ DW_TAG_lexical_block ]
+!142 = metadata !{metadata !"0xb\00609\00200\0094", metadata !152, metadata !23} ; [ DW_TAG_lexical_block ]
 !143 = metadata !{i32 611, i32 17, metadata !142, null}
 !144 = metadata !{i32 612, i32 17, metadata !142, null}
 !145 = metadata !{i32 613, i32 17, metadata !142, null}
@@ -257,7 +257,7 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
 !159 = metadata !{metadata !"header15.h", metadata !"/Volumes/Sandbox/llvm"}
 !160 = metadata !{metadata !"header.h", metadata !"/Volumes/Sandbox/llvm"}
 !161 = metadata !{metadata !"header2.h", metadata !"/Volumes/Sandbox/llvm"}
-!162 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
-!163 = metadata !{i64 1, i64 20, i64 2, i64 1, i64 4, i64 2, i64 1, i64 24}
-!164 = metadata !{i64 1, i64 24}
-!165 = metadata !{i64 1, i64 28}
+!162 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!163 = metadata !{metadata !"0x102\0034\0020\006\0034\004\006\0034\0024"} ; [ DW_TAG_expression ] [DW_OP_plus 20 DW_OP_deref DW_OP_plus 4 DW_OP_deref DW_OP_plus 24]
+!164 = metadata !{metadata !"0x102\0034\0024"} ; [ DW_TAG_expression ] [DW_OP_plus 24]
+!165 = metadata !{metadata !"0x102\0034\0028"} ; [ DW_TAG_expression ] [DW_OP_plus 28]
diff --git a/test/CodeGen/ARM/debug-info-branch-folding.ll b/test/CodeGen/ARM/debug-info-branch-folding.ll
index 8505f53..db96b49 100644
--- a/test/CodeGen/ARM/debug-info-branch-folding.ll
+++ b/test/CodeGen/ARM/debug-info-branch-folding.ll
@@ -3,6 +3,7 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-
 target triple = "thumbv7-apple-macosx10.6.7"
 
 ;CHECK: 	vadd.f32	q4, q8, q8
+;CHECK-NEXT: Ltmp1
 ;CHECK-NEXT: LBB0_1
 
 ;CHECK:@DEBUG_VALUE: x <- Q4{{$}}
@@ -19,9 +20,9 @@ entry:
 
 for.body9:                                        ; preds = %for.body9, %entry
   %add19 = fadd <4 x float> undef, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, !dbg !39
-  tail call void @llvm.dbg.value(metadata !{<4 x float> %add19}, i64 0, metadata !27), !dbg !39
+  tail call void @llvm.dbg.value(metadata !{<4 x float> %add19}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !39
   %add20 = fadd <4 x float> undef, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, !dbg !39
-  tail call void @llvm.dbg.value(metadata !{<4 x float> %add20}, i64 0, metadata !28), !dbg !39
+  tail call void @llvm.dbg.value(metadata !{<4 x float> %add20}, i64 0, metadata !28, metadata !{metadata !"0x102"}), !dbg !39
   br i1 %cond, label %for.end54, label %for.body9, !dbg !44
 
 for.end54:                                        ; preds = %for.body9
@@ -36,58 +37,59 @@ for.end54:                                        ; preds = %for.body9
 
 declare i32 @printf(i8* nocapture, ...) nounwind
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.module.flags = !{!56}
+!llvm.dbg.cu = !{!2}
 
-!0 = metadata !{i32 786478, metadata !54, null, metadata !"test0001", metadata !"test0001", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, <4 x float> (float)* @test0001, null, null, metadata !51, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !54} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !54, i32 12, metadata !"clang version 3.0 (trunk 129915)", i1 true, metadata !"", i32 0, metadata !17, metadata !17, metadata !50, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!0 = metadata !{metadata !"0x2e\00test0001\00test0001\00\003\000\001\000\006\00256\001\000", metadata !54, null, metadata !3, i32 0, <4 x float> (float)* @test0001, null, null, metadata !51} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !54} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 129915)\001\00\000\00\001", metadata !54, metadata !17, metadata !17, metadata !50, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !54, metadata !1, i32 0, metadata !4, i32 0} ; [ DW_TAG_subroutine_type ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786454, metadata !54, metadata !2, metadata !"v4f32", i32 14, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
-!6 = metadata !{i32 786433, metadata !54, metadata !2, metadata !"", i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [from float]
-!7 = metadata !{i32 786468, null, metadata !2, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
+!5 = metadata !{metadata !"0x16\00v4f32\0014\000\000\000\000", metadata !54, metadata !2, metadata !6} ; [ DW_TAG_typedef ]
+!6 = metadata !{metadata !"0x1\00\000\00128\00128\000\000", metadata !54, metadata !2, metadata !7, metadata !8, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [from float]
+!7 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !2} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786465, i64 0, i64 4}         ; [ DW_TAG_subrange_type ]
-!10 = metadata !{i32 786478, metadata !54, null, metadata !"main", metadata !"main", metadata !"", i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**, i1)* @main, null, null, metadata !52, i32 0} ; [ DW_TAG_subprogram ] [line 59] [def] [scope 0] [main]
-!11 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x21\000\004"}         ; [ DW_TAG_subrange_type ]
+!10 = metadata !{metadata !"0x2e\00main\00main\00\0059\000\001\000\006\00256\001\000", metadata !54, null, metadata !11, null, i32 (i32, i8**, i1)* @main, null, null, metadata !52} ; [ DW_TAG_subprogram ] [line 59] [def] [scope 0] [main]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !54, metadata !1, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786478, metadata !55, null, metadata !"printFV", metadata !"printFV", metadata !"", i32 41, metadata !16, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !53, i32 0} ; [ DW_TAG_subprogram ] [line 41] [local] [def] [scope 0] [printFV]
-!15 = metadata !{i32 786473, metadata !55} ; [ DW_TAG_file_type ]
-!16 = metadata !{i32 786453, metadata !55, metadata !15, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0x2e\00printFV\00printFV\00\0041\001\001\000\006\00256\001\000", metadata !55, null, metadata !16, null, null, null, null, metadata !53} ; [ DW_TAG_subprogram ] [line 41] [local] [def] [scope 0] [printFV]
+!15 = metadata !{metadata !"0x29", metadata !55} ; [ DW_TAG_file_type ]
+!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !55, metadata !15, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !17 = metadata !{null}
-!18 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 16777219, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786689, metadata !10, metadata !"argc", metadata !1, i32 16777275, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
-!20 = metadata !{i32 786689, metadata !10, metadata !"argv", metadata !1, i32 33554491, metadata !21, i32 0, null} ; [ DW_TAG_arg_variable ]
-!21 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !22} ; [ DW_TAG_pointer_type ]
-!22 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{i32 786468, null, metadata !2, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!24 = metadata !{i32 786688, metadata !25, metadata !"i", metadata !1, i32 60, metadata !13, i32 0, null} ; [ DW_TAG_auto_variable ]
-!25 = metadata !{i32 786443, metadata !1, metadata !10, i32 59, i32 33, i32 14} ; [ DW_TAG_lexical_block ]
-!26 = metadata !{i32 786688, metadata !25, metadata !"j", metadata !1, i32 60, metadata !13, i32 0, null} ; [ DW_TAG_auto_variable ]
-!27 = metadata !{i32 786688, metadata !25, metadata !"x", metadata !1, i32 61, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!28 = metadata !{i32 786688, metadata !25, metadata !"y", metadata !1, i32 62, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{i32 786688, metadata !25, metadata !"z", metadata !1, i32 63, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!30 = metadata !{i32 786689, metadata !14, metadata !"F", metadata !15, i32 16777257, metadata !31, i32 0, null} ; [ DW_TAG_arg_variable ]
-!31 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_pointer_type ]
-!32 = metadata !{i32 786454, metadata !55, metadata !2, metadata !"FV", i32 25, i64 0, i64 0, i64 0, i32 0, metadata !33} ; [ DW_TAG_typedef ]
-!33 = metadata !{i32 786455, metadata !55, metadata !2, metadata !"", i32 22, i64 128, i64 128, i64 0, i32 0, i32 0, metadata !34, i32 0, i32 0} ; [ DW_TAG_union_type ]
+!18 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !0, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x101\00argc\0016777275\000", metadata !10, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
+!20 = metadata !{metadata !"0x101\00argv\0033554491\000", metadata !10, metadata !1, metadata !21} ; [ DW_TAG_arg_variable ]
+!21 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !22} ; [ DW_TAG_pointer_type ]
+!22 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !23} ; [ DW_TAG_pointer_type ]
+!23 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !2} ; [ DW_TAG_base_type ]
+!24 = metadata !{metadata !"0x100\00i\0060\000", metadata !25, metadata !1, metadata !13} ; [ DW_TAG_auto_variable ]
+!25 = metadata !{metadata !"0xb\0059\0033\0014", metadata !1, metadata !10} ; [ DW_TAG_lexical_block ]
+!26 = metadata !{metadata !"0x100\00j\0060\000", metadata !25, metadata !1, metadata !13} ; [ DW_TAG_auto_variable ]
+!27 = metadata !{metadata !"0x100\00x\0061\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!28 = metadata !{metadata !"0x100\00y\0062\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!29 = metadata !{metadata !"0x100\00z\0063\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!30 = metadata !{metadata !"0x101\00F\0016777257\000", metadata !14, metadata !15, metadata !31} ; [ DW_TAG_arg_variable ]
+!31 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !32} ; [ DW_TAG_pointer_type ]
+!32 = metadata !{metadata !"0x16\00FV\0025\000\000\000\000", metadata !55, metadata !2, metadata !33} ; [ DW_TAG_typedef ]
+!33 = metadata !{metadata !"0x17\00\0022\00128\00128\000\000\000", metadata !55, metadata !2, i32 0, metadata !34, null} ; [ DW_TAG_union_type ]
 !34 = metadata !{metadata !35, metadata !37}
-!35 = metadata !{i32 786445, metadata !55, metadata !15, metadata !"V", i32 23, i64 128, i64 128, i64 0, i32 0, metadata !36} ; [ DW_TAG_member ]
-!36 = metadata !{i32 786454, metadata !55, metadata !2, metadata !"v4sf", i32 3, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
-!37 = metadata !{i32 786445, metadata !55, metadata !15, metadata !"A", i32 24, i64 128, i64 32, i64 0, i32 0, metadata !38} ; [ DW_TAG_member ]
-!38 = metadata !{i32 786433, null, metadata !2, metadata !"", i32 0, i64 128, i64 32, i32 0, i32 0, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_array_type ]
+!35 = metadata !{metadata !"0xd\00V\0023\00128\00128\000\000", metadata !55, metadata !15, metadata !36} ; [ DW_TAG_member ]
+!36 = metadata !{metadata !"0x16\00v4sf\003\000\000\000\000", metadata !55, metadata !2, metadata !6} ; [ DW_TAG_typedef ]
+!37 = metadata !{metadata !"0xd\00A\0024\00128\0032\000\000", metadata !55, metadata !15, metadata !38} ; [ DW_TAG_member ]
+!38 = metadata !{metadata !"0x1\00\000\00128\0032\000\000", null, metadata !2, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_array_type ]
 !39 = metadata !{i32 79, i32 7, metadata !40, null}
-!40 = metadata !{i32 786443, metadata !1, metadata !41, i32 75, i32 35, i32 18} ; [ DW_TAG_lexical_block ]
-!41 = metadata !{i32 786443, metadata !1, metadata !42, i32 75, i32 5, i32 17} ; [ DW_TAG_lexical_block ]
-!42 = metadata !{i32 786443, metadata !1, metadata !43, i32 71, i32 32, i32 16} ; [ DW_TAG_lexical_block ]
-!43 = metadata !{i32 786443, metadata !1, metadata !25, i32 71, i32 3, i32 15} ; [ DW_TAG_lexical_block ]
+!40 = metadata !{metadata !"0xb\0075\0035\0018", metadata !1, metadata !41} ; [ DW_TAG_lexical_block ]
+!41 = metadata !{metadata !"0xb\0075\005\0017", metadata !1, metadata !42} ; [ DW_TAG_lexical_block ]
+!42 = metadata !{metadata !"0xb\0071\0032\0016", metadata !1, metadata !43} ; [ DW_TAG_lexical_block ]
+!43 = metadata !{metadata !"0xb\0071\003\0015", metadata !1, metadata !25} ; [ DW_TAG_lexical_block ]
 !44 = metadata !{i32 75, i32 5, metadata !42, null}
 !45 = metadata !{i32 42, i32 2, metadata !46, metadata !48}
-!46 = metadata !{i32 786443, metadata !15, metadata !47, i32 42, i32 2, i32 20} ; [ DW_TAG_lexical_block ]
-!47 = metadata !{i32 786443, metadata !15, metadata !14, i32 41, i32 28, i32 19} ; [ DW_TAG_lexical_block ]
+!46 = metadata !{metadata !"0xb\0042\002\0020", metadata !15, metadata !47} ; [ DW_TAG_lexical_block ]
+!47 = metadata !{metadata !"0xb\0041\0028\0019", metadata !15, metadata !14} ; [ DW_TAG_lexical_block ]
 !48 = metadata !{i32 95, i32 3, metadata !25, null}
 !49 = metadata !{i32 99, i32 3, metadata !25, null}
 !50 = metadata !{metadata !0, metadata !10, metadata !14}
@@ -96,4 +98,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !53 = metadata !{metadata !30}
 !54 = metadata !{metadata !"build2.c", metadata !"/private/tmp"}
 !55 = metadata !{metadata !"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", metadata !"/private/tmp"}
-!56 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!56 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/debug-info-d16-reg.ll b/test/CodeGen/ARM/debug-info-d16-reg.ll
index 30a3e2d..9791987 100644
--- a/test/CodeGen/ARM/debug-info-d16-reg.ll
+++ b/test/CodeGen/ARM/debug-info-d16-reg.ll
@@ -12,9 +12,9 @@ target triple = "thumbv7-apple-darwin10"
 
 define i32 @inlineprinter(i8* %ptr, double %val, i8 zeroext %c) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !19), !dbg !26
-  tail call void @llvm.dbg.value(metadata !{double %val}, i64 0, metadata !20), !dbg !26
-  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !21), !dbg !26
+  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !19, metadata !{metadata !"0x102"}), !dbg !26
+  tail call void @llvm.dbg.value(metadata !{double %val}, i64 0, metadata !20, metadata !{metadata !"0x102"}), !dbg !26
+  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !21, metadata !{metadata !"0x102"}), !dbg !26
   %0 = zext i8 %c to i32, !dbg !27
   %1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %val, i32 %0) nounwind, !dbg !27
   ret i32 0, !dbg !29
@@ -22,9 +22,9 @@ entry:
 
 define i32 @printer(i8* %ptr, double %val, i8 zeroext %c) nounwind optsize noinline {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !16), !dbg !30
-  tail call void @llvm.dbg.value(metadata !{double %val}, i64 0, metadata !17), !dbg !30
-  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !18), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{double %val}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !30
   %0 = zext i8 %c to i32, !dbg !31
   %1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %val, i32 %0) nounwind, !dbg !31
   ret i32 0, !dbg !33
@@ -32,22 +32,22 @@ entry:
 
 declare i32 @printf(i8* nocapture, ...) nounwind
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 define i32 @main(i32 %argc, i8** nocapture %argv) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !22), !dbg !34
-  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !23), !dbg !34
+  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !22, metadata !{metadata !"0x102"}), !dbg !34
+  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !34
   %0 = sitofp i32 %argc to double, !dbg !35
   %1 = fadd double %0, 5.555552e+05, !dbg !35
-  tail call void @llvm.dbg.value(metadata !{double %1}, i64 0, metadata !24), !dbg !35
+  tail call void @llvm.dbg.value(metadata !{double %1}, i64 0, metadata !24, metadata !{metadata !"0x102"}), !dbg !35
   %2 = tail call i32 @puts(i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0)) nounwind, !dbg !36
   %3 = getelementptr inbounds i8* bitcast (i32 (i32, i8**)* @main to i8*), i32 %argc, !dbg !37
   %4 = trunc i32 %argc to i8, !dbg !37
   %5 = add i8 %4, 97, !dbg !37
-  tail call void @llvm.dbg.value(metadata !{i8* %3}, i64 0, metadata !19) nounwind, !dbg !38
-  tail call void @llvm.dbg.value(metadata !{double %1}, i64 0, metadata !20) nounwind, !dbg !38
-  tail call void @llvm.dbg.value(metadata !{i8 %5}, i64 0, metadata !21) nounwind, !dbg !38
+  tail call void @llvm.dbg.value(metadata !{i8* %3}, i64 0, metadata !19, metadata !{metadata !"0x102"}) nounwind, !dbg !38
+  tail call void @llvm.dbg.value(metadata !{double %1}, i64 0, metadata !20, metadata !{metadata !"0x102"}) nounwind, !dbg !38
+  tail call void @llvm.dbg.value(metadata !{i8 %5}, i64 0, metadata !21, metadata !{metadata !"0x102"}) nounwind, !dbg !38
   %6 = zext i8 %5 to i32, !dbg !39
   %7 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %3, double %1, i32 %6) nounwind, !dbg !39
   %8 = tail call i32 @printer(i8* %3, double %1, i8 zeroext %5) nounwind, !dbg !40
@@ -59,39 +59,39 @@ declare i32 @puts(i8* nocapture) nounwind
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!48}
 
-!0 = metadata !{i32 786478, metadata !46, metadata !1, metadata !"printer", metadata !"printer", metadata !"printer", i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, double, i8)* @printer, null, null, metadata !43, i32 12} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !46} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !46, i32 1, metadata !"(LLVM build 00)", i1 true, metadata !"", i32 0, metadata !47, metadata !47, metadata !42, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !46, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00printer\00printer\00printer\0012\000\001\000\006\00256\001\0012", metadata !46, metadata !1, metadata !3, null, i32 (i8*, double, i8)* @printer, null, null, metadata !43} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !46} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\001\00(LLVM build 00)\001\00\000\00\001", metadata !46, metadata !47, metadata !47, metadata !42, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !46, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5, metadata !6, metadata !7, metadata !8}
-!5 = metadata !{i32 786468, metadata !46, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786447, metadata !46, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 786468, metadata !46, metadata !1, metadata !"double", i32 0, i64 64, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 786468, metadata !46, metadata !1, metadata !"unsigned char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 786478, metadata !46, metadata !1, metadata !"inlineprinter", metadata !"inlineprinter", metadata !"inlineprinter", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, double, i8)* @inlineprinter, null, null, metadata !44, i32 5} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786478, metadata !46, metadata !1, metadata !"main", metadata !"main", metadata !"main", i32 18, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !45, i32 18} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, metadata !46, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !46, metadata !1} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !46, metadata !1, null} ; [ DW_TAG_pointer_type ]
+!7 = metadata !{metadata !"0x24\00double\000\0064\0032\000\000\004", metadata !46, metadata !1} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0x24\00unsigned char\000\008\008\000\000\008", metadata !46, metadata !1} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x2e\00inlineprinter\00inlineprinter\00inlineprinter\005\000\001\000\006\00256\001\005", metadata !46, metadata !1, metadata !3, null, i32 (i8*, double, i8)* @inlineprinter, null, null, metadata !44} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x2e\00main\00main\00main\0018\000\001\000\006\00256\001\0018", metadata !46, metadata !1, metadata !11, null, i32 (i32, i8**)* @main, null, null, metadata !45} ; [ DW_TAG_subprogram ]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !46, metadata !1, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !5, metadata !5, metadata !13}
-!13 = metadata !{i32 786447, metadata !46, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786447, metadata !46, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !15} ; [ DW_TAG_pointer_type ]
-!15 = metadata !{i32 786468, metadata !46, metadata !1, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!16 = metadata !{i32 786689, metadata !0, metadata !"ptr", metadata !1, i32 11, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{i32 786689, metadata !0, metadata !"val", metadata !1, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!18 = metadata !{i32 786689, metadata !0, metadata !"c", metadata !1, i32 11, metadata !8, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786689, metadata !9, metadata !"ptr", metadata !1, i32 4, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!20 = metadata !{i32 786689, metadata !9, metadata !"val", metadata !1, i32 4, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!21 = metadata !{i32 786689, metadata !9, metadata !"c", metadata !1, i32 4, metadata !8, i32 0, null} ; [ DW_TAG_arg_variable ]
-!22 = metadata !{i32 786689, metadata !10, metadata !"argc", metadata !1, i32 17, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!23 = metadata !{i32 786689, metadata !10, metadata !"argv", metadata !1, i32 17, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
-!24 = metadata !{i32 786688, metadata !25, metadata !"dval", metadata !1, i32 19, metadata !7, i32 0, null} ; [ DW_TAG_auto_variable ]
-!25 = metadata !{i32 786443, metadata !46, metadata !10, i32 18, i32 0, i32 2} ; [ DW_TAG_lexical_block ]
+!13 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !46, metadata !1, metadata !14} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !46, metadata !1, metadata !15} ; [ DW_TAG_pointer_type ]
+!15 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !46, metadata !1} ; [ DW_TAG_base_type ]
+!16 = metadata !{metadata !"0x101\00ptr\0011\000", metadata !0, metadata !1, metadata !6} ; [ DW_TAG_arg_variable ]
+!17 = metadata !{metadata !"0x101\00val\0011\000", metadata !0, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
+!18 = metadata !{metadata !"0x101\00c\0011\000", metadata !0, metadata !1, metadata !8} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x101\00ptr\004\000", metadata !9, metadata !1, metadata !6} ; [ DW_TAG_arg_variable ]
+!20 = metadata !{metadata !"0x101\00val\004\000", metadata !9, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
+!21 = metadata !{metadata !"0x101\00c\004\000", metadata !9, metadata !1, metadata !8} ; [ DW_TAG_arg_variable ]
+!22 = metadata !{metadata !"0x101\00argc\0017\000", metadata !10, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!23 = metadata !{metadata !"0x101\00argv\0017\000", metadata !10, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
+!24 = metadata !{metadata !"0x100\00dval\0019\000", metadata !25, metadata !1, metadata !7} ; [ DW_TAG_auto_variable ]
+!25 = metadata !{metadata !"0xb\0018\000\002", metadata !46, metadata !10} ; [ DW_TAG_lexical_block ]
 !26 = metadata !{i32 4, i32 0, metadata !9, null}
 !27 = metadata !{i32 6, i32 0, metadata !28, null}
-!28 = metadata !{i32 786443, metadata !46, metadata !9, i32 5, i32 0, i32 1} ; [ DW_TAG_lexical_block ]
+!28 = metadata !{metadata !"0xb\005\000\001", metadata !46, metadata !9} ; [ DW_TAG_lexical_block ]
 !29 = metadata !{i32 7, i32 0, metadata !28, null}
 !30 = metadata !{i32 11, i32 0, metadata !0, null}
 !31 = metadata !{i32 13, i32 0, metadata !32, null}
-!32 = metadata !{i32 786443, metadata !46, metadata !0, i32 12, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!32 = metadata !{metadata !"0xb\0012\000\000", metadata !46, metadata !0} ; [ DW_TAG_lexical_block ]
 !33 = metadata !{i32 14, i32 0, metadata !32, null}
 !34 = metadata !{i32 17, i32 0, metadata !10, null}
 !35 = metadata !{i32 19, i32 0, metadata !25, null}
@@ -107,4 +107,4 @@ declare i32 @puts(i8* nocapture) nounwind
 !45 = metadata !{metadata !22, metadata !23, metadata !24}
 !46 = metadata !{metadata !"a.c", metadata !"/tmp/"}
 !47 = metadata !{i32 0}
-!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/debug-info-qreg.ll b/test/CodeGen/ARM/debug-info-qreg.ll
index 03ce312..cfcefb8 100644
--- a/test/CodeGen/ARM/debug-info-qreg.ll
+++ b/test/CodeGen/ARM/debug-info-qreg.ll
@@ -26,7 +26,7 @@ for.body9:                                        ; preds = %for.body9, %entry
   br i1 undef, label %for.end54, label %for.body9, !dbg !44
 
 for.end54:                                        ; preds = %for.body9
-  tail call void @llvm.dbg.value(metadata !{<4 x float> %add19}, i64 0, metadata !27), !dbg !39
+  tail call void @llvm.dbg.value(metadata !{<4 x float> %add19}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !39
   %tmp115 = extractelement <4 x float> %add19, i32 1
   %conv6.i75 = fpext float %tmp115 to double, !dbg !45
   %call.i82 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0), double undef, double %conv6.i75, double undef, double undef) nounwind, !dbg !45
@@ -35,59 +35,59 @@ for.end54:                                        ; preds = %for.body9
 
 declare i32 @printf(i8* nocapture, ...) nounwind
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!56}
 
-!0 = metadata !{i32 786478, metadata !54, metadata !1, metadata !"test0001", metadata !"test0001", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, <4 x float> (float)* @test0001, null, null, metadata !51, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [test0001]
-!1 = metadata !{i32 786473, metadata !54} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !54, i32 12, metadata !"clang version 3.0 (trunk 129915)", i1 true, metadata !"", i32 0, metadata !17, metadata !17, metadata !50, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00test0001\00test0001\00\003\000\001\000\006\00256\001\003", metadata !54, metadata !1, metadata !3, null, <4 x float> (float)* @test0001, null, null, metadata !51} ; [ DW_TAG_subprogram ] [line 3] [def] [test0001]
+!1 = metadata !{metadata !"0x29", metadata !54} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 129915)\001\00\000\00\001", metadata !54, metadata !17, metadata !17, metadata !50, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !54, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786454, metadata !54, metadata !2, metadata !"v4f32", i32 14, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
-!6 = metadata !{i32 786433, metadata !2, null, metadata !2, i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [from float]
-!7 = metadata !{i32 786468, null, metadata !2, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
+!5 = metadata !{metadata !"0x16\00v4f32\0014\000\000\000\000", metadata !54, metadata !2, metadata !6} ; [ DW_TAG_typedef ]
+!6 = metadata !{metadata !"0x1\00\000\00128\00128\000\000", metadata !2, null, metadata !7, metadata !8, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [from float]
+!7 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !2} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786465, i64 0, i64 4}         ; [ DW_TAG_subrange_type ]
-!10 = metadata !{i32 786478, metadata !54, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !52, i32 59} ; [ DW_TAG_subprogram ] [line 59] [def] [main]
-!11 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x21\000\004"}         ; [ DW_TAG_subrange_type ]
+!10 = metadata !{metadata !"0x2e\00main\00main\00\0059\000\001\000\006\00256\001\0059", metadata !54, metadata !1, metadata !11, null, i32 (i32, i8**)* @main, null, null, metadata !52} ; [ DW_TAG_subprogram ] [line 59] [def] [main]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !54, metadata !1, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786478, metadata !55, metadata !15, metadata !"printFV", metadata !"printFV", metadata !"", i32 41, metadata !16, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !53, i32 41} ; [ DW_TAG_subprogram ] [line 41] [local] [def] [printFV]
-!15 = metadata !{i32 786473, metadata !55} ; [ DW_TAG_file_type ]
-!16 = metadata !{i32 786453, metadata !55, metadata !15, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0x2e\00printFV\00printFV\00\0041\001\001\000\006\00256\001\0041", metadata !55, metadata !15, metadata !16, null, null, null, null, metadata !53} ; [ DW_TAG_subprogram ] [line 41] [local] [def] [printFV]
+!15 = metadata !{metadata !"0x29", metadata !55} ; [ DW_TAG_file_type ]
+!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !55, metadata !15, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !17 = metadata !{null}
-!18 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 16777219, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786689, metadata !10, metadata !"argc", metadata !1, i32 16777275, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
-!20 = metadata !{i32 786689, metadata !10, metadata !"argv", metadata !1, i32 33554491, metadata !21, i32 0, null} ; [ DW_TAG_arg_variable ]
-!21 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !22} ; [ DW_TAG_pointer_type ]
-!22 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{i32 786468, null, metadata !2, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!24 = metadata !{i32 786688, metadata !25, metadata !"i", metadata !1, i32 60, metadata !13, i32 0, null} ; [ DW_TAG_auto_variable ]
-!25 = metadata !{i32 786443, metadata !54, metadata !10, i32 59, i32 33, i32 14} ; [ DW_TAG_lexical_block ]
-!26 = metadata !{i32 786688, metadata !25, metadata !"j", metadata !1, i32 60, metadata !13, i32 0, null} ; [ DW_TAG_auto_variable ]
-!27 = metadata !{i32 786688, metadata !25, metadata !"x", metadata !1, i32 61, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!28 = metadata !{i32 786688, metadata !25, metadata !"y", metadata !1, i32 62, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{i32 786688, metadata !25, metadata !"z", metadata !1, i32 63, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!30 = metadata !{i32 786689, metadata !14, metadata !"F", metadata !15, i32 16777257, metadata !31, i32 0, null} ; [ DW_TAG_arg_variable ]
-!31 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_pointer_type ]
-!32 = metadata !{i32 786454, metadata !55, metadata !2, metadata !"FV", i32 25, i64 0, i64 0, i64 0, i32 0, metadata !33} ; [ DW_TAG_typedef ]
-!33 = metadata !{i32 786455, metadata !55, metadata !2, metadata !"", i32 22, i64 128, i64 128, i64 0, i32 0, i32 0, metadata !34, i32 0, null} ; [ DW_TAG_union_type ]
+!18 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !0, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x101\00argc\0016777275\000", metadata !10, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
+!20 = metadata !{metadata !"0x101\00argv\0033554491\000", metadata !10, metadata !1, metadata !21} ; [ DW_TAG_arg_variable ]
+!21 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !22} ; [ DW_TAG_pointer_type ]
+!22 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !23} ; [ DW_TAG_pointer_type ]
+!23 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !2} ; [ DW_TAG_base_type ]
+!24 = metadata !{metadata !"0x100\00i\0060\000", metadata !25, metadata !1, metadata !13} ; [ DW_TAG_auto_variable ]
+!25 = metadata !{metadata !"0xb\0059\0033\0014", metadata !54, metadata !10} ; [ DW_TAG_lexical_block ]
+!26 = metadata !{metadata !"0x100\00j\0060\000", metadata !25, metadata !1, metadata !13} ; [ DW_TAG_auto_variable ]
+!27 = metadata !{metadata !"0x100\00x\0061\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!28 = metadata !{metadata !"0x100\00y\0062\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!29 = metadata !{metadata !"0x100\00z\0063\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!30 = metadata !{metadata !"0x101\00F\0016777257\000", metadata !14, metadata !15, metadata !31} ; [ DW_TAG_arg_variable ]
+!31 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !32} ; [ DW_TAG_pointer_type ]
+!32 = metadata !{metadata !"0x16\00FV\0025\000\000\000\000", metadata !55, metadata !2, metadata !33} ; [ DW_TAG_typedef ]
+!33 = metadata !{metadata !"0x17\00\0022\00128\00128\000\000\000", metadata !55, metadata !2, i32 0, metadata !34, null} ; [ DW_TAG_union_type ]
 !34 = metadata !{metadata !35, metadata !37}
-!35 = metadata !{i32 786445, metadata !55, metadata !15, metadata !"V", i32 23, i64 128, i64 128, i64 0, i32 0, metadata !36} ; [ DW_TAG_member ]
-!36 = metadata !{i32 786454, metadata !55, metadata !2, metadata !"v4sf", i32 3, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
-!37 = metadata !{i32 786445, metadata !55, metadata !15, metadata !"A", i32 24, i64 128, i64 32, i64 0, i32 0, metadata !38} ; [ DW_TAG_member ]
-!38 = metadata !{i32 786433, null, metadata !2, metadata !"", i32 0, i64 128, i64 32, i32 0, i32 0, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_array_type ]
+!35 = metadata !{metadata !"0xd\00V\0023\00128\00128\000\000", metadata !55, metadata !15, metadata !36} ; [ DW_TAG_member ]
+!36 = metadata !{metadata !"0x16\00v4sf\003\000\000\000\000", metadata !55, metadata !2, metadata !6} ; [ DW_TAG_typedef ]
+!37 = metadata !{metadata !"0xd\00A\0024\00128\0032\000\000", metadata !55, metadata !15, metadata !38} ; [ DW_TAG_member ]
+!38 = metadata !{metadata !"0x1\00\000\00128\0032\000\000", null, metadata !2, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_array_type ]
 !39 = metadata !{i32 79, i32 7, metadata !40, null}
-!40 = metadata !{i32 786443, metadata !54, metadata !41, i32 75, i32 35, i32 18} ; [ DW_TAG_lexical_block ]
-!41 = metadata !{i32 786443, metadata !54, metadata !42, i32 75, i32 5, i32 17} ; [ DW_TAG_lexical_block ]
-!42 = metadata !{i32 786443, metadata !54, metadata !43, i32 71, i32 32, i32 16} ; [ DW_TAG_lexical_block ]
-!43 = metadata !{i32 786443, metadata !54, metadata !25, i32 71, i32 3, i32 15} ; [ DW_TAG_lexical_block ]
+!40 = metadata !{metadata !"0xb\0075\0035\0018", metadata !54, metadata !41} ; [ DW_TAG_lexical_block ]
+!41 = metadata !{metadata !"0xb\0075\005\0017", metadata !54, metadata !42} ; [ DW_TAG_lexical_block ]
+!42 = metadata !{metadata !"0xb\0071\0032\0016", metadata !54, metadata !43} ; [ DW_TAG_lexical_block ]
+!43 = metadata !{metadata !"0xb\0071\003\0015", metadata !54, metadata !25} ; [ DW_TAG_lexical_block ]
 !44 = metadata !{i32 75, i32 5, metadata !42, null}
 !45 = metadata !{i32 42, i32 2, metadata !46, metadata !48}
-!46 = metadata !{i32 786443, metadata !55, metadata !47, i32 42, i32 2, i32 20} ; [ DW_TAG_lexical_block ]
-!47 = metadata !{i32 786443, metadata !55, metadata !14, i32 41, i32 28, i32 19} ; [ DW_TAG_lexical_block ]
+!46 = metadata !{metadata !"0xb\0042\002\0020", metadata !55, metadata !47} ; [ DW_TAG_lexical_block ]
+!47 = metadata !{metadata !"0xb\0041\0028\0019", metadata !55, metadata !14} ; [ DW_TAG_lexical_block ]
 !48 = metadata !{i32 95, i32 3, metadata !25, null}
 !49 = metadata !{i32 99, i32 3, metadata !25, null}
 !50 = metadata !{metadata !0, metadata !10, metadata !14}
@@ -96,4 +96,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !53 = metadata !{metadata !30}
 !54 = metadata !{metadata !"build2.c", metadata !"/private/tmp"}
 !55 = metadata !{metadata !"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", metadata !"/private/tmp"}
-!56 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!56 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/debug-info-s16-reg.ll b/test/CodeGen/ARM/debug-info-s16-reg.ll
index ee9faf8..6bd7172 100644
--- a/test/CodeGen/ARM/debug-info-s16-reg.ll
+++ b/test/CodeGen/ARM/debug-info-s16-reg.ll
@@ -15,9 +15,9 @@ target triple = "thumbv7-apple-macosx10.6.7"
 
 define i32 @inlineprinter(i8* %ptr, float %val, i8 zeroext %c) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !8), !dbg !24
-  tail call void @llvm.dbg.value(metadata !{float %val}, i64 0, metadata !10), !dbg !25
-  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !12), !dbg !26
+  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !8, metadata !{metadata !"0x102"}), !dbg !24
+  tail call void @llvm.dbg.value(metadata !{float %val}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !25
+  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !26
   %conv = fpext float %val to double, !dbg !27
   %conv3 = zext i8 %c to i32, !dbg !27
   %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %conv, i32 %conv3) nounwind optsize, !dbg !27
@@ -28,9 +28,9 @@ declare i32 @printf(i8* nocapture, ...) nounwind optsize
 
 define i32 @printer(i8* %ptr, float %val, i8 zeroext %c) nounwind optsize noinline ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !14), !dbg !30
-  tail call void @llvm.dbg.value(metadata !{float %val}, i64 0, metadata !15), !dbg !31
-  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !16), !dbg !32
+  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{float %val}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !31
+  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !32
   %conv = fpext float %val to double, !dbg !33
   %conv3 = zext i8 %c to i32, !dbg !33
   %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %conv, i32 %conv3) nounwind optsize, !dbg !33
@@ -39,19 +39,19 @@ entry:
 
 define i32 @main(i32 %argc, i8** nocapture %argv) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !17), !dbg !36
-  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !18), !dbg !37
+  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !36
+  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !37
   %conv = sitofp i32 %argc to double, !dbg !38
   %add = fadd double %conv, 5.555552e+05, !dbg !38
   %conv1 = fptrunc double %add to float, !dbg !38
-  tail call void @llvm.dbg.value(metadata !{float %conv1}, i64 0, metadata !22), !dbg !38
+  tail call void @llvm.dbg.value(metadata !{float %conv1}, i64 0, metadata !22, metadata !{metadata !"0x102"}), !dbg !38
   %call = tail call i32 @puts(i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0)) nounwind optsize, !dbg !39
   %add.ptr = getelementptr i8* bitcast (i32 (i32, i8**)* @main to i8*), i32 %argc, !dbg !40
   %add5 = add nsw i32 %argc, 97, !dbg !40
   %conv6 = trunc i32 %add5 to i8, !dbg !40
-  tail call void @llvm.dbg.value(metadata !{i8* %add.ptr}, i64 0, metadata !8) nounwind, !dbg !41
-  tail call void @llvm.dbg.value(metadata !{float %conv1}, i64 0, metadata !10) nounwind, !dbg !42
-  tail call void @llvm.dbg.value(metadata !{i8 %conv6}, i64 0, metadata !12) nounwind, !dbg !43
+  tail call void @llvm.dbg.value(metadata !{i8* %add.ptr}, i64 0, metadata !8, metadata !{metadata !"0x102"}) nounwind, !dbg !41
+  tail call void @llvm.dbg.value(metadata !{float %conv1}, i64 0, metadata !10, metadata !{metadata !"0x102"}) nounwind, !dbg !42
+  tail call void @llvm.dbg.value(metadata !{i8 %conv6}, i64 0, metadata !12, metadata !{metadata !"0x102"}) nounwind, !dbg !43
   %conv.i = fpext float %conv1 to double, !dbg !44
   %conv3.i = and i32 %add5, 255, !dbg !44
   %call.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %add.ptr, double %conv.i, i32 %conv3.i) nounwind optsize, !dbg !44
@@ -61,46 +61,46 @@ entry:
 
 declare i32 @puts(i8* nocapture) nounwind optsize
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!53}
 
-!0 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"inlineprinter", metadata !"inlineprinter", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, float, i8)* @inlineprinter, null, null, metadata !48, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [inlineprinter]
-!1 = metadata !{i32 786473, metadata !51} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !51, i32 12, metadata !"clang version 3.0 (trunk 129915)", i1 true, metadata !"", i32 0, metadata !52, metadata !52, metadata !47, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !51, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00inlineprinter\00inlineprinter\00\005\000\001\000\006\00256\001\005", metadata !51, metadata !1, metadata !3, null, i32 (i8*, float, i8)* @inlineprinter, null, null, metadata !48} ; [ DW_TAG_subprogram ] [line 5] [def] [inlineprinter]
+!1 = metadata !{metadata !"0x29", metadata !51} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 129915)\001\00\000\00\001", metadata !51, metadata !52, metadata !52, metadata !47, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !51, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"printer", metadata !"printer", metadata !"", i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, float, i8)* @printer, null, null, metadata !49, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [printer]
-!7 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 18, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !50, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [main]
-!8 = metadata !{i32 786689, metadata !0, metadata !"ptr", metadata !1, i32 16777220, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 786689, metadata !0, metadata !"val", metadata !1, i32 33554436, metadata !11, i32 0, null} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{i32 786468, null, metadata !2, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!12 = metadata !{i32 786689, metadata !0, metadata !"c", metadata !1, i32 50331652, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
-!13 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786689, metadata !6, metadata !"ptr", metadata !1, i32 16777227, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 786689, metadata !6, metadata !"val", metadata !1, i32 33554443, metadata !11, i32 0, null} ; [ DW_TAG_arg_variable ]
-!16 = metadata !{i32 786689, metadata !6, metadata !"c", metadata !1, i32 50331659, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{i32 786689, metadata !7, metadata !"argc", metadata !1, i32 16777233, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!18 = metadata !{i32 786689, metadata !7, metadata !"argv", metadata !1, i32 33554449, metadata !19, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ]
-!20 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !21} ; [ DW_TAG_pointer_type ]
-!21 = metadata !{i32 786468, null, metadata !2, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!22 = metadata !{i32 786688, metadata !23, metadata !"dval", metadata !1, i32 19, metadata !11, i32 0, null} ; [ DW_TAG_auto_variable ]
-!23 = metadata !{i32 786443, metadata !51, metadata !7, i32 18, i32 1, i32 2} ; [ DW_TAG_lexical_block ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00printer\00printer\00\0012\000\001\000\006\00256\001\0012", metadata !51, metadata !1, metadata !3, null, i32 (i8*, float, i8)* @printer, null, null, metadata !49} ; [ DW_TAG_subprogram ] [line 12] [def] [printer]
+!7 = metadata !{metadata !"0x2e\00main\00main\00\0018\000\001\000\006\00256\001\0018", metadata !51, metadata !1, metadata !3, null, i32 (i32, i8**)* @main, null, null, metadata !50} ; [ DW_TAG_subprogram ] [line 18] [def] [main]
+!8 = metadata !{metadata !"0x101\00ptr\0016777220\000", metadata !0, metadata !1, metadata !9} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, null} ; [ DW_TAG_pointer_type ]
+!10 = metadata !{metadata !"0x101\00val\0033554436\000", metadata !0, metadata !1, metadata !11} ; [ DW_TAG_arg_variable ]
+!11 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !2} ; [ DW_TAG_base_type ]
+!12 = metadata !{metadata !"0x101\00c\0050331652\000", metadata !0, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
+!13 = metadata !{metadata !"0x24\00unsigned char\000\008\008\000\000\008", null, metadata !2} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0x101\00ptr\0016777227\000", metadata !6, metadata !1, metadata !9} ; [ DW_TAG_arg_variable ]
+!15 = metadata !{metadata !"0x101\00val\0033554443\000", metadata !6, metadata !1, metadata !11} ; [ DW_TAG_arg_variable ]
+!16 = metadata !{metadata !"0x101\00c\0050331659\000", metadata !6, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
+!17 = metadata !{metadata !"0x101\00argc\0016777233\000", metadata !7, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!18 = metadata !{metadata !"0x101\00argv\0033554449\000", metadata !7, metadata !1, metadata !19} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !20} ; [ DW_TAG_pointer_type ]
+!20 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !21} ; [ DW_TAG_pointer_type ]
+!21 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !2} ; [ DW_TAG_base_type ]
+!22 = metadata !{metadata !"0x100\00dval\0019\000", metadata !23, metadata !1, metadata !11} ; [ DW_TAG_auto_variable ]
+!23 = metadata !{metadata !"0xb\0018\001\002", metadata !51, metadata !7} ; [ DW_TAG_lexical_block ]
 !24 = metadata !{i32 4, i32 22, metadata !0, null}
 !25 = metadata !{i32 4, i32 33, metadata !0, null}
 !26 = metadata !{i32 4, i32 52, metadata !0, null}
 !27 = metadata !{i32 6, i32 3, metadata !28, null}
-!28 = metadata !{i32 786443, metadata !51, metadata !0, i32 5, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
+!28 = metadata !{metadata !"0xb\005\001\000", metadata !51, metadata !0} ; [ DW_TAG_lexical_block ]
 !29 = metadata !{i32 7, i32 3, metadata !28, null}
 !30 = metadata !{i32 11, i32 42, metadata !6, null}
 !31 = metadata !{i32 11, i32 53, metadata !6, null}
 !32 = metadata !{i32 11, i32 72, metadata !6, null}
 !33 = metadata !{i32 13, i32 3, metadata !34, null}
-!34 = metadata !{i32 786443, metadata !51, metadata !6, i32 12, i32 1, i32 1} ; [ DW_TAG_lexical_block ]
+!34 = metadata !{metadata !"0xb\0012\001\001", metadata !51, metadata !6} ; [ DW_TAG_lexical_block ]
 !35 = metadata !{i32 14, i32 3, metadata !34, null}
 !36 = metadata !{i32 17, i32 15, metadata !7, null}
 !37 = metadata !{i32 17, i32 28, metadata !7, null}
@@ -119,4 +119,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !50 = metadata !{metadata !17, metadata !18, metadata !22}
 !51 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
 !52 = metadata !{i32 0}
-!53 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!53 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/debug-info-sreg2.ll b/test/CodeGen/ARM/debug-info-sreg2.ll
index 71a696a..4374b9e 100644
--- a/test/CodeGen/ARM/debug-info-sreg2.ll
+++ b/test/CodeGen/ARM/debug-info-sreg2.ll
@@ -1,26 +1,21 @@
-; RUN: llc < %s - | FileCheck %s
+; RUN: llc < %s - -filetype=obj | llvm-dwarfdump -debug-dump=loc - | FileCheck %s
 ; Radar 9376013
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
 target triple = "thumbv7-apple-macosx10.6.7"
 
-;CHECK-LABEL: Lfunc_begin0:
-;CHECK: Ltmp[[K:[0-9]+]]:
-;CHECK: Ltmp[[L:[0-9]+]]:
-;CHECK-LABEL: Ldebug_loc0:
-;CHECK-NEXT:        .long   Ltmp[[K]]
-;CHECK-NEXT:        .long   Ltmp[[L]]
-;CHECK-NEXT: Lset[[N:[0-9]+]] = Ltmp{{[0-9]+}}-Ltmp[[M:[0-9]+]]        @ Loc expr size
-;CHECK-NEXT:        .short  Lset[[N]]
-;CHECK-NEXT: Ltmp[[M]]:
-;CHECK-NEXT:        .byte   144                     @ super-register
-;CHECK-NEXT:                                        @ DW_OP_regx
-;CHECK-NEXT:        .ascii
-;CHECK-NEXT:        .byte   {{[0-9]+}}              @ DW_OP_{{.*}}piece
+; Just making sure the first part of the location isn't a repetition
+; of the size of the location description.
+;
+; 0x90   DW_OP_regx of super-register
+
+; CHECK: 0x00000000: Beginning address offset:
+; CHECK-NEXT:           Ending address offset:
+; CHECK-NEXT:            Location description: 90 {{.. .. .. .. $}}
 
 define void @_Z3foov() optsize ssp {
 entry:
   %call = tail call float @_Z3barv() optsize, !dbg !11
-  tail call void @llvm.dbg.value(metadata !{float %call}, i64 0, metadata !5), !dbg !11
+  tail call void @llvm.dbg.value(metadata !{float %call}, i64 0, metadata !5, metadata !{metadata !"0x102"}), !dbg !11
   %call16 = tail call float @_Z2f2v() optsize, !dbg !12
   %cmp7 = fcmp olt float %call, %call16, !dbg !12
   br i1 %cmp7, label %for.body, label %for.end, !dbg !12
@@ -43,22 +38,22 @@ declare float @_Z2f2v() optsize
 
 declare float @_Z2f3f(float) optsize
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!20}
 
-!0 = metadata !{i32 786449, metadata !18, i32 4, metadata !"clang version 3.0 (trunk 130845)", i1 true, metadata !"", i32 0, metadata !19, metadata !19, metadata !16, null,  null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @_Z3foov, null, null, metadata !17, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
-!2 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.0 (trunk 130845)\001\00\000\00\001", metadata !18, metadata !19, metadata !19, metadata !16, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3foov\005\000\001\000\006\00256\001\005", metadata !18, metadata !2, metadata !3, null, void ()* @_Z3foov, null, null, metadata !17} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
+!2 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
-!5 = metadata !{i32 786688, metadata !6, metadata !"k", metadata !2, i32 6, metadata !7, i32 0, null} ; [ DW_TAG_auto_variable ]
-!6 = metadata !{i32 786443, metadata !18, metadata !1, i32 5, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
-!7 = metadata !{i32 786468, null, metadata !0, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 786688, metadata !9, metadata !"y", metadata !2, i32 8, metadata !7, i32 0, null} ; [ DW_TAG_auto_variable ]
-!9 = metadata !{i32 786443, metadata !18, metadata !10, i32 7, i32 25, i32 2} ; [ DW_TAG_lexical_block ]
-!10 = metadata !{i32 786443, metadata !18, metadata !6, i32 7, i32 3, i32 1} ; [ DW_TAG_lexical_block ]
+!5 = metadata !{metadata !"0x100\00k\006\000", metadata !6, metadata !2, metadata !7} ; [ DW_TAG_auto_variable ]
+!6 = metadata !{metadata !"0xb\005\0012\000", metadata !18, metadata !1} ; [ DW_TAG_lexical_block ]
+!7 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !0} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0x100\00y\008\000", metadata !9, metadata !2, metadata !7} ; [ DW_TAG_auto_variable ]
+!9 = metadata !{metadata !"0xb\007\0025\002", metadata !18, metadata !10} ; [ DW_TAG_lexical_block ]
+!10 = metadata !{metadata !"0xb\007\003\001", metadata !18, metadata !6} ; [ DW_TAG_lexical_block ]
 !11 = metadata !{i32 6, i32 18, metadata !6, null}
 !12 = metadata !{i32 7, i32 3, metadata !6, null}
 !13 = metadata !{i32 8, i32 20, metadata !9, null}
@@ -68,4 +63,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !17 = metadata !{metadata !5, metadata !8}
 !18 = metadata !{metadata !"k.cc", metadata !"/private/tmp"}
 !19 = metadata !{i32 0}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/ARM/debug-segmented-stacks.ll b/test/CodeGen/ARM/debug-segmented-stacks.ll
index e866b4e..2123fa7 100644
--- a/test/CodeGen/ARM/debug-segmented-stacks.ll
+++ b/test/CodeGen/ARM/debug-segmented-stacks.ll
@@ -39,39 +39,37 @@ define void @test_basic() #0 {
 ; ARM-linux       .cfi_same_value r5
 }
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/var.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/var.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"var.c", metadata !"/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test_basic",
-  metadata !"test_basic", metadata !"", i32 5, metadata !6, i1 false, i1 true,
-  i32 0, i32 0, null, i32 256, i1 false, void ()* @test_basic, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [sum]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/var.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00test_basic\00test_basic\00\005\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !6, null, void ()* @test_basic, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [sum]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/var.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !11 = metadata !{metadata !"clang version 3.5 "}
-!12 = metadata !{i32 786689, metadata !4, metadata !"count", metadata !5, i32 16777221, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [count] [line 5]
+!12 = metadata !{metadata !"0x101\00count\0016777221\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [count] [line 5]
 !13 = metadata !{i32 5, i32 0, metadata !4, null}
-!14 = metadata !{i32 786688, metadata !4, metadata !"vl", metadata !5, i32 6, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vl] [line 6]
-!15 = metadata !{i32 786454, metadata !16, null, metadata !"va_list", i32 30, i64 0, i64 0, i64 0, i32 0, metadata !17} ; [ DW_TAG_typedef ] [va_list] [line 30, size 0, align 0, offset 0] [from __builtin_va_list]
+!14 = metadata !{metadata !"0x100\00vl\006\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [vl] [line 6]
+!15 = metadata !{metadata !"0x16\00va_list\0030\000\000\000\000", metadata !16, null, metadata !17} ; [ DW_TAG_typedef ] [va_list] [line 30, size 0, align 0, offset 0] [from __builtin_va_list]
 !16 = metadata !{metadata !"/linux-x86_64-high/gcc_4.7.2/dbg/llvm/bin/../lib/clang/3.5/include/stdarg.h", metadata !"/tmp"}
-!17 = metadata !{i32 786454, metadata !1, null, metadata !"__builtin_va_list", i32 6, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_typedef ] [__builtin_va_list] [line 6, size 0, align 0, offset 0] [from __va_list]
-!18 = metadata !{i32 786451, metadata !1, null, metadata !"__va_list", i32 6, i64 32, i64 32, i32 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__va_list] [line 6, size 32, align 32, offset 0] [def] [from ]
+!17 = metadata !{metadata !"0x16\00__builtin_va_list\006\000\000\000\000", metadata !1, null, metadata !18} ; [ DW_TAG_typedef ] [__builtin_va_list] [line 6, size 0, align 0, offset 0] [from __va_list]
+!18 = metadata !{metadata !"0x13\00__va_list\006\0032\0032\000\000\000", metadata !1, null, null, metadata !19, null, null, null} ; [ DW_TAG_structure_type ] [__va_list] [line 6, size 32, align 32, offset 0] [def] [from ]
 !19 = metadata !{metadata !20}
-!20 = metadata !{i32 786445, metadata !1, metadata !18, metadata !"__ap", i32 6, i64 32, i64 32, i64 0, i32 0, metadata !21} ; [ DW_TAG_member ] [__ap] [line 6, size 32, align 32, offset 0] [from ]
-!21 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from ]
+!20 = metadata !{metadata !"0xd\00__ap\006\0032\0032\000\000", metadata !1, metadata !18, metadata !21} ; [ DW_TAG_member ] [__ap] [line 6, size 32, align 32, offset 0] [from ]
+!21 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from ]
 !22 = metadata !{i32 6, i32 0, metadata !4, null}
 !23 = metadata !{i32 7, i32 0, metadata !4, null}
-!24 = metadata !{i32 786688, metadata !4, metadata !"test_basic", metadata !5, i32 8, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [sum] [line 8]
-!25 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
-!26 = metadata !{i32 786688, metadata !27, metadata !"i", metadata !5, i32 9, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 9]
-!27 = metadata !{i32 786443, metadata !1, metadata !4, i32 9, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
+!24 = metadata !{metadata !"0x100\00test_basic\008\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [sum] [line 8]
+!25 = metadata !{i32 8, i32 0, metadata !4, null}
+!26 = metadata !{metadata !"0x100\00i\009\000", metadata !27, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 9]
+!27 = metadata !{metadata !"0xb\009\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
 !28 = metadata !{i32 9, i32 0, metadata !27, null}
 !29 = metadata !{i32 10, i32 0, metadata !30, null}
-!30 = metadata !{i32 786443, metadata !1, metadata !27, i32 9, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
+!30 = metadata !{metadata !"0xb\009\000\001", metadata !1, metadata !27} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
 !31 = metadata !{i32 11, i32 0, metadata !30, null}
 !32 = metadata !{i32 12, i32 0, metadata !4, null}
 !33 = metadata !{i32 13, i32 0, metadata !4, null}
diff --git a/test/CodeGen/ARM/dwarf-unwind.ll b/test/CodeGen/ARM/dwarf-unwind.ll
new file mode 100644
index 0000000..5256db8
--- /dev/null
+++ b/test/CodeGen/ARM/dwarf-unwind.ll
@@ -0,0 +1,82 @@
+; RUN: llc -mtriple=thumbv7-netbsd-eabi -o - %s | FileCheck %s
+declare void @bar()
+
+; ARM's frame lowering attempts to tack another callee-saved register onto the
+; list when it detects a potential misaligned VFP store. However, if there are
+; none available it used to just vpush anyway and misreport the location of the
+; registers in unwind info. Since there are benefits to aligned stores, it's
+; better to correct the code than the .cfi_offset directive.
+
+define void @test_dpr_align(i8 %l, i8 %r) {
+; CHECK-LABEL: test_dpr_align:
+; CHECK: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK: .cfi_def_cfa_offset 36
+; CHECK: sub sp, #4
+; CHECK: .cfi_def_cfa_offset 40
+; CHECK: vpush {d8}
+; CHECK: .cfi_offset d8, -48
+; CHECK-NOT: sub sp
+; [...]
+; CHECK: bl bar
+; CHECK-NOT: add sp
+; CHECK: vpop {d8}
+; CHECK: add sp, #4
+; CHECK: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+  call void asm sideeffect "", "~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{d8}"()
+  call void @bar()
+  ret void
+}
+
+; The prologue (but not the epilogue) can be made more space efficient by
+; chucking an argument register into the list. Not worth it in general though,
+; "sub sp, #4" is likely faster.
+define void @test_dpr_align_tiny(i8 %l, i8 %r) minsize {
+; CHECK-LABEL: test_dpr_align_tiny:
+; CHECK: push.w {r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NOT: sub sp
+; CHECK: vpush {d8}
+; CHECK: .cfi_offset d8, -48
+; CHECK-NOT: sub sp
+; [...]
+; CHECK: bl bar
+; CHECK-NOT: add sp
+; CHECK: vpop {d8}
+; CHECK: add sp, #4
+; CHECK: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+  call void asm sideeffect "", "~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{d8}"()
+  call void @bar()
+  ret void
+}
+
+
+; However, we shouldn't do a 2-step align/adjust if there are no DPRs to be
+; saved.
+define void @test_nodpr_noalign(i8 %l, i8 %r) {
+; CHECK-LABEL: test_nodpr_noalign:
+; CHECK: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NOT: sub sp
+; CHECK: sub sp, #12
+; CHECK-NOT: sub sp
+; [...]
+; CHECK: bl bar
+; CHECK-NOT: add sp
+; CHECK: add sp, #12
+; CHECK-NOT: add sp
+; CHECK: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+  alloca i64
+  call void asm sideeffect "", "~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11}"()
+  call void @bar()
+  ret void
+}
+
+define void @test_frame_pointer_offset() minsize "no-frame-pointer-elim"="true" {
+; CHECK-LABEL: test_frame_pointer_offset:
+; CHECK: push.w {r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK: .cfi_def_cfa_offset 40
+; CHECK: add r7, sp, #16
+; CHECK: .cfi_def_cfa r7, 24
+; CHECK-NOT: .cfi_def_cfa_offset
+  call void asm sideeffect "", "~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{d8}"()
+  call void @bar()
+  ret void
+}
+\ No newline at end of file
diff --git a/test/CodeGen/ARM/fabs-neon.ll b/test/CodeGen/ARM/fabs-neon.ll
index e3094aa..dc1dc32 100644
--- a/test/CodeGen/ARM/fabs-neon.ll
+++ b/test/CodeGen/ARM/fabs-neon.ll
@@ -15,3 +15,42 @@ define <2 x float> @test2(<2 x float> %a) {
     ret <2 x float> %foo
 }
 declare <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+
+; No constant pool loads or vector ops are needed for the fabs of a
+; bitcasted integer constant; we should just return integer constants
+; that have the sign bits turned off.
+;
+; So instead of something like this:
+; 	mvn	r0, #0
+; 	mov	r1, #0
+; 	vmov	d16, r1, r0
+; 	vabs.f32	d16, d16
+; 	vmov	r0, r1, d16
+; 	bx	lr
+;
+; We should generate:
+;	mov	r0, #0
+;	mvn	r1, #-2147483648
+;	bx	lr
+
+define i64 @fabs_v2f32_1() {
+; CHECK-LABEL: fabs_v2f32_1:
+; CHECK: mvn r1, #-2147483648
+; CHECK: bx lr
+; CHECK-NOT: vabs
+ %bitcast = bitcast i64 18446744069414584320 to <2 x float> ; 0xFFFF_FFFF_0000_0000
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
+ %ret = bitcast <2 x float> %fabs to i64
+ ret i64 %ret
+}
+
+define i64 @fabs_v2f32_2() {
+; CHECK-LABEL: fabs_v2f32_2:
+; CHECK: mvn r0, #-2147483648
+; CHECK: bx lr
+; CHECK-NOT: vabs
+ %bitcast = bitcast i64 4294967295 to <2 x float> ; 0x0000_0000_FFFF_FFFF
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
+ %ret = bitcast <2 x float> %fabs to i64
+ ret i64 %ret
+}
diff --git a/test/CodeGen/ARM/fast-isel-call.ll b/test/CodeGen/ARM/fast-isel-call.ll
index 2d7378e..74b31bd 100644
--- a/test/CodeGen/ARM/fast-isel-call.ll
+++ b/test/CodeGen/ARM/fast-isel-call.ll
@@ -117,17 +117,11 @@ entry:
 ; ARM-LONG: blx [[R]]
 ; THUMB: @t10
 ; THUMB: movs [[R0:l?r[0-9]*]], #0
-; THUMB: movt [[R0]], #0
 ; THUMB: movs [[R1:l?r[0-9]*]], #248
-; THUMB: movt [[R1]], #0
 ; THUMB: movs [[R2:l?r[0-9]*]], #187
-; THUMB: movt [[R2]], #0
 ; THUMB: movs [[R3:l?r[0-9]*]], #28
-; THUMB: movt [[R3]], #0
 ; THUMB: movw [[R4:l?r[0-9]*]], #40
-; THUMB: movt [[R4]], #0
 ; THUMB: movw [[R5:l?r[0-9]*]], #186
-; THUMB: movt [[R5]], #0
 ; THUMB: and [[R0]], [[R0]], #255
 ; THUMB: and [[R1]], [[R1]], #255
 ; THUMB: and [[R2]], [[R2]], #255
@@ -250,4 +244,19 @@ entry:
   ret void
 }
 
+declare void @bar2(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6)
+
+define void @call_undef_args() {
+; ARM-LABEL: call_undef_args
+; ARM:       movw  r0, #1
+; ARM-NEXT:  movw  r1, #2
+; ARM-NEXT:  movw  r2, #3
+; ARM-NEXT:  movw  r3, #4
+; ARM-NOT:   str {{r[0-9]+}}, [sp]
+; ARM:       movw  [[REG:l?r[0-9]*]], #6
+; ARM-NEXT:  str [[REG]], [sp, #4]
+  call void @bar2(i32 1, i32 2, i32 3, i32 4, i32 undef, i32 6)
+  ret void
+}
+
 declare void @print(float)
diff --git a/test/CodeGen/ARM/fast-isel-deadcode.ll b/test/CodeGen/ARM/fast-isel-deadcode.ll
index 5e6666c..c3eed30 100644
--- a/test/CodeGen/ARM/fast-isel-deadcode.ll
+++ b/test/CodeGen/ARM/fast-isel-deadcode.ll
@@ -14,7 +14,6 @@ entry:
 ; THUMB-NOT: ldr
 ; THUMB-NOT: sxtb
 ; THUMB: movs r0, #0
-; THUMB: movt r0, #0
 ; THUMB: pop
   ret i32 0
 }
diff --git a/test/CodeGen/ARM/fast-isel-inline-asm.ll b/test/CodeGen/ARM/fast-isel-inline-asm.ll
new file mode 100644
index 0000000..2eb25ec
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-inline-asm.ll
@@ -0,0 +1,18 @@
+; RUN: llc -fast-isel < %s | FileCheck %s
+target datalayout = "e-m:o-p:32:32-i1:8:32-i8:8:32-i16:16:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7-apple-ios5.0.0"
+
+%0 = type opaque
+
+; Make sure that the inline asm starts right after the call to bar.
+define void @test_inline_asm_sideeffect(%0* %call) {
+; CHECK:      bl _bar
+; CHECK-NEXT: InlineAsm Start
+  call void @bar()
+  call void asm sideeffect "mov\09r7, r7\09\09@ marker", ""()
+  %1 = call %0* bitcast (i8* (i8*)* @foo to %0* (%0*)*)(%0* %call)
+  ret void
+}
+
+declare i8* @foo(i8*)
+declare void @bar()
diff --git a/test/CodeGen/ARM/fast-isel-intrinsic.ll b/test/CodeGen/ARM/fast-isel-intrinsic.ll
index 089209e..b09931d 100644
--- a/test/CodeGen/ARM/fast-isel-intrinsic.ll
+++ b/test/CodeGen/ARM/fast-isel-intrinsic.ll
@@ -31,9 +31,7 @@ define void @t1() nounwind ssp {
 ; THUMB: {{(movt r0, :upper16:_?message1)|(ldr r0, \[r0\])}}
 ; THUMB: adds r0, #5
 ; THUMB: movs r1, #64
-; THUMB: movt r1, #0
 ; THUMB: movs r2, #10
-; THUMB: movt r2, #0
 ; THUMB: and r1, r1, #255
 ; THUMB: bl {{_?}}memset
 ; THUMB-LONG-LABEL: t1:
@@ -71,7 +69,6 @@ define void @t2() nounwind ssp {
 ; THUMB: adds r1, r0, #4
 ; THUMB: adds r0, #16
 ; THUMB: movs r2, #17
-; THUMB: movt r2, #0
 ; THUMB: str r0, [sp[[SLOT:[, #0-9]*]]] @ 4-byte Spill
 ; THUMB: mov r0, r1
 ; THUMB: ldr r1,  [sp[[SLOT]]] @ 4-byte Reload
@@ -109,7 +106,6 @@ define void @t3() nounwind ssp {
 ; THUMB: adds r1, r0, #4
 ; THUMB: adds r0, #16
 ; THUMB: movs r2, #10
-; THUMB: movt r2, #0
 ; THUMB: str r0, [sp[[SLOT:[, #0-9]*]]] @ 4-byte Spill
 ; THUMB: mov r0, r1
 ; THUMB: ldr r1,  [sp[[SLOT]]] @ 4-byte Reload
diff --git a/test/CodeGen/ARM/fast-isel-mvn.ll b/test/CodeGen/ARM/fast-isel-mvn.ll
index 0bc9395..886f2da 100644
--- a/test/CodeGen/ARM/fast-isel-mvn.ll
+++ b/test/CodeGen/ARM/fast-isel-mvn.ll
@@ -1,17 +1,14 @@
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -arm-use-movt=false -mtriple=armv7-apple-ios     < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ARM
+; RUN: llc -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -arm-use-movt=false -mtriple=armv7-linux-gnueabi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ARM
+; RUN: llc -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -arm-use-movt=false -mtriple=thumbv7-apple-ios   < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ARM
+; RUN: llc -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -arm-use-movt=true  -mtriple=thumbv7-apple-ios   < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB
+; RUN: llc -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -arm-use-movt=true  -mtriple=armv7-apple-ios     < %s | FileCheck %s --check-prefix=MOVT
 ; rdar://10412592
 
-; Note: The Thumb code is being generated by the target-independent selector.
-
 define void @t1() nounwind {
 entry:
-; ARM: t1
-; THUMB: t1
-; ARM: mvn r0, #0
-; THUMB: movw r0, #65535
-; THUMB: movt r0, #65535
+; CHECK-LABEL: t1
+; CHECK:       mvn r0, #0
   call void @foo(i32 -1)
   ret void
 }
@@ -20,22 +17,16 @@ declare void @foo(i32)
 
 define void @t2() nounwind {
 entry:
-; ARM: t2
-; THUMB: t2
-; ARM: mvn r0, #233
-; THUMB: movw r0, #65302
-; THUMB: movt r0, #65535
+; CHECK-LABEL: t2
+; CHECK:       mvn r0, #233
   call void @foo(i32 -234)
   ret void
 }
 
 define void @t3() nounwind {
 entry:
-; ARM: t3
-; THUMB: t3
-; ARM: mvn	r0, #256
-; THUMB: movw r0, #65279
-; THUMB: movt r0, #65535
+; CHECK-LABEL: t3
+; CHECK:       mvn r0, #256
   call void @foo(i32 -257)
   ret void
 }
@@ -43,66 +34,60 @@ entry:
 ; Load from constant pool
 define void @t4() nounwind {
 entry:
-; ARM: t4
-; THUMB: t4
-; ARM: ldr	r0
-; THUMB: movw r0, #65278
-; THUMB: movt r0, #65535
+; ARM-LABEL:   t4
+; ARM:         ldr r0
+; THUMB-LABEL: t4
+; THUMB:       movw r0, #65278
+; THUMB:       movt r0, #65535
   call void @foo(i32 -258)
   ret void
 }
 
 define void @t5() nounwind {
 entry:
-; ARM: t5
-; THUMB: t5
-; ARM: mvn r0, #65280
-; THUMB: movs r0, #255
-; THUMB: movt r0, #65535
+; CHECK-LABEL: t5
+; CHECK:       mvn r0, #65280
   call void @foo(i32 -65281)
   ret void
 }
 
 define void @t6() nounwind {
 entry:
-; ARM: t6
-; THUMB: t6
-; ARM: mvn r0, #978944
-; THUMB: movw r0, #4095
-; THUMB: movt r0, #65521
+; CHECK-LABEL: t6
+; CHECK:       mvn r0, #978944
   call void @foo(i32 -978945)
   ret void
 }
 
 define void @t7() nounwind {
 entry:
-; ARM: t7
-; THUMB: t7
-; ARM: mvn r0, #267386880
-; THUMB: movw r0, #65535
-; THUMB: movt r0, #61455
+; CHECK-LABEL: t7
+; CHECK:       mvn r0, #267386880
   call void @foo(i32 -267386881)
   ret void
 }
 
 define void @t8() nounwind {
 entry:
-; ARM: t8
-; THUMB: t8
-; ARM: mvn r0, #65280
-; THUMB: movs r0, #255
-; THUMB: movt r0, #65535
+; CHECK-LABEL: t8
+; CHECK:       mvn r0, #65280
   call void @foo(i32 -65281)
   ret void
 }
 
 define void @t9() nounwind {
 entry:
-; ARM: t9
-; THUMB: t9
-; ARM: mvn r0, #2130706432
-; THUMB: movw r0, #65535
-; THUMB: movt r0, #33023
+; CHECK-LABEL: t9
+; CHECK:       mvn r0, #2130706432
   call void @foo(i32 -2130706433)
   ret void
 }
+
+; Load from constant pool.
+define i32 @t10(i32 %a) {
+; MOVT-LABEL: t10
+; MOVT:       ldr
+  %1 = xor i32 -1998730207, %a
+  ret i32 %1
+}
+
diff --git a/test/CodeGen/ARM/fast-isel-select.ll b/test/CodeGen/ARM/fast-isel-select.ll
index 40f8807..549c97e 100644
--- a/test/CodeGen/ARM/fast-isel-select.ll
+++ b/test/CodeGen/ARM/fast-isel-select.ll
@@ -12,7 +12,6 @@ entry:
 ; ARM: mov r0, r{{[1-9]}}
 ; THUMB: t1
 ; THUMB: movs r{{[1-9]}}, #10
-; THUMB: movt r{{[1-9]}}, #0
 ; THUMB: cmp r0, #0
 ; THUMB: it eq
 ; THUMB: moveq r{{[1-9]}}, #20
@@ -59,13 +58,12 @@ entry:
 ; ARM: cmp r0, #0
 ; ARM: mvneq r{{[1-9]}}, #0
 ; ARM: mov r0, r{{[1-9]}}
-; THUMB: t4
-; THUMB: movw r{{[1-9]}}, #65526
-; THUMB: movt r{{[1-9]}}, #65535
+; THUMB-LABEL: t4
+; THUMB: mvn [[REG:r[1-9]+]], #9
 ; THUMB: cmp r0, #0
 ; THUMB: it eq
-; THUMB: mvneq r{{[1-9]}}, #0
-; THUMB: mov r0, r{{[1-9]}}
+; THUMB: mvneq [[REG]], #0
+; THUMB: mov r0, [[REG]]
   %0 = select i1 %c, i32 -10, i32 -1
   ret i32 %0
 }
diff --git a/test/CodeGen/ARM/fast-isel-vararg.ll b/test/CodeGen/ARM/fast-isel-vararg.ll
index 0b7b0bd..3ff2b15 100644
--- a/test/CodeGen/ARM/fast-isel-vararg.ll
+++ b/test/CodeGen/ARM/fast-isel-vararg.ll
@@ -29,7 +29,6 @@ entry:
 ; ARM: bl {{_?CallVariadic}}
 ; THUMB: sub sp, #32
 ; THUMB: movs r0, #5
-; THUMB: movt r0, #0
 ; THUMB: ldr r1, [sp, #28]
 ; THUMB: ldr r2, [sp, #24]
 ; THUMB: ldr r3, [sp, #20]
diff --git a/test/CodeGen/ARM/fnegs.ll b/test/CodeGen/ARM/fnegs.ll
index 36af835..65fe9e3 100644
--- a/test/CodeGen/ARM/fnegs.ll
+++ b/test/CodeGen/ARM/fnegs.ll
@@ -1,9 +1,12 @@
 ; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - \
 ; RUN:  | FileCheck %s -check-prefix=VFP2
 
-; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - \
+; RUN: llc -mtriple=arm-eabi -mattr=+neon,-neonfp %s -o - \
 ; RUN:  | FileCheck %s -check-prefix=NFP0
 
+; RUN: llc -mtriple=arm-eabi -mattr=+neon,+neonfp %s -o - \
+; RUN:  | FileCheck %s -check-prefix=NFP1
+
 ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \
 ; RUN:  | FileCheck %s -check-prefix=CORTEXA8
 
@@ -70,3 +73,49 @@ entry:
 ; CORTEXA9-LABEL: test2:
 ; CORTEXA9: 	vneg.f32	s{{.*}}, s{{.*}}
 
+; If we're bitcasting an integer to an FP vector, we should avoid the FP/vector unit entirely.
+; Make sure that we're flipping the sign bit and only the sign bit of each float (PR20354).
+; So instead of something like this:
+;    vmov     d16, r0, r1
+;    vneg.f32 d16, d16
+;    vmov     r0, r1, d16
+;
+; We should generate:
+;    eor     r0, r0, #-214783648
+;    eor     r1, r1, #-214783648
+
+define <2 x float> @fneg_bitcast(i64 %i) {
+  %bitcast = bitcast i64 %i to <2 x float>
+  %fneg = fsub <2 x float> <float -0.0, float -0.0>, %bitcast
+  ret <2 x float> %fneg
+}
+; VFP2-LABEL: fneg_bitcast:
+; VFP2-DAG: eor r0, r0, #-2147483648
+; VFP2-DAG: eor r1, r1, #-2147483648
+; VFP2-NOT:  vneg.f32
+
+; NFP1-LABEL: fneg_bitcast:
+; NFP1-DAG: eor r0, r0, #-2147483648
+; NFP1-DAG: eor r1, r1, #-2147483648
+; NFP1-NOT: vneg.f32
+
+; NFP0-LABEL: fneg_bitcast:
+; NFP0-DAG: eor r0, r0, #-2147483648
+; NFP0-DAG: eor r1, r1, #-2147483648
+; NFP0-NOT: vneg.f32
+
+; CORTEXA8-LABEL: fneg_bitcast:
+; CORTEXA8-DAG: eor r0, r0, #-2147483648
+; CORTEXA8-DAG: eor r1, r1, #-2147483648
+; CORTEXA8-NOT:         vneg.f32
+
+; CORTEXA8U-LABEL: fneg_bitcast:
+; CORTEXA8U-DAG: eor r0, r0, #-2147483648
+; CORTEXA8U-DAG: eor r1, r1, #-2147483648
+; CORTEXA8U-NOT:        vneg.f32
+
+; CORTEXA9-LABEL: fneg_bitcast:
+; CORTEXA9-DAG: eor r0, r0, #-2147483648
+; CORTEXA9-DAG: eor r1, r1, #-2147483648
+; CORTEXA9-NOT:         vneg.f32
+
diff --git a/test/CodeGen/ARM/fold-stack-adjust.ll b/test/CodeGen/ARM/fold-stack-adjust.ll
index eb0120f..514d4a9 100644
--- a/test/CodeGen/ARM/fold-stack-adjust.ll
+++ b/test/CodeGen/ARM/fold-stack-adjust.ll
@@ -167,9 +167,9 @@ end:
 define void @test_varsize(...) minsize {
 ; CHECK-T1-LABEL: test_varsize:
 ; CHECK-T1: sub	sp, #16
-; CHECK-T1: push	{r2, r3, r4, r5, r7, lr}
+; CHECK-T1: push	{r5, r6, r7, lr}
 ; ...
-; CHECK-T1: pop	{r2, r3, r4, r5, r7}
+; CHECK-T1: pop	{r2, r3, r7}
 ; CHECK-T1: pop	{r3}
 ; CHECK-T1: add	sp, #16
 ; CHECK-T1: bx	r3
@@ -183,6 +183,7 @@ define void @test_varsize(...) minsize {
 ; CHECK: bx	lr
 
   %var = alloca i8, i32 8
+  call void @llvm.va_start(i8* %var)
   call void @bar(i8* %var)
   ret void
 }
@@ -216,3 +217,5 @@ if.then:                                          ; preds = %entry
 exit:                                             ; preds = %if.then, %entry
   ret float %call1
 }
+
+declare void @llvm.va_start(i8*) nounwind
diff --git a/test/CodeGen/ARM/fp16.ll b/test/CodeGen/ARM/fp16.ll
index fba7946..5a926ac 100644
--- a/test/CodeGen/ARM/fp16.ll
+++ b/test/CodeGen/ARM/fp16.ll
@@ -1,32 +1,84 @@
 ; RUN: llc < %s | FileCheck %s
 ; RUN: llc -mattr=+vfp3,+fp16 < %s | FileCheck --check-prefix=CHECK-FP16 %s
+; RUN: llc -mtriple=armv8-eabihf < %s | FileCheck --check-prefix=CHECK-ARMV8 %s
+; RUN: llc -mtriple=thumbv7m-eabi < %s | FileCheck --check-prefix=CHECK-SOFTFLOAT %s
+
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32"
-target triple = "armv7-eabi"
+target triple = "armv7---eabihf"
 
 @x = global i16 12902
 @y = global i16 0
 @z = common global i16 0
 
-define arm_aapcs_vfpcc void @foo() nounwind {
+define void @foo() nounwind {
 ; CHECK-LABEL: foo:
 ; CHECK-FP16-LABEL: foo:
+; CHECK-ARMV8-LABEL: foo:
+; CHECK-SOFTFLOAT-LABEL: foo:
 entry:
   %0 = load i16* @x, align 2
   %1 = load i16* @y, align 2
-  %2 = tail call float @llvm.convert.from.fp16(i16 %0)
+  %2 = tail call float @llvm.convert.from.fp16.f32(i16 %0)
 ; CHECK: __gnu_h2f_ieee
 ; CHECK-FP16: vcvtb.f32.f16
-  %3 = tail call float @llvm.convert.from.fp16(i16 %1)
+; CHECK-ARMv8: vcvtb.f32.f16
+; CHECK-SOFTFLOAT: __gnu_h2f_ieee
+  %3 = tail call float @llvm.convert.from.fp16.f32(i16 %1)
 ; CHECK: __gnu_h2f_ieee
 ; CHECK-FP16: vcvtb.f32.f16
+; CHECK-ARMV8: vcvtb.f32.f16
+; CHECK-SOFTFLOAT: __gnu_h2f_ieee
   %4 = fadd float %2, %3
-  %5 = tail call i16 @llvm.convert.to.fp16(float %4)
+  %5 = tail call i16 @llvm.convert.to.fp16.f32(float %4)
 ; CHECK: __gnu_f2h_ieee
 ; CHECK-FP16: vcvtb.f16.f32
+; CHECK-ARMV8: vcvtb.f16.f32
+; CHECK-SOFTFLOAT: __gnu_f2h_ieee
   store i16 %5, i16* @x, align 2
   ret void
 }
 
-declare float @llvm.convert.from.fp16(i16) nounwind readnone
+define double @test_from_fp16(i16 %in) {
+; CHECK-LABEL: test_from_fp16:
+; CHECK-FP16-LABEL: test_from_fp16:
+; CHECK-ARMV8-LABEL: test_from_fp16:
+; CHECK-SOFTFLOAT-LABEL: test_from_fp16:
+  %val = call double @llvm.convert.from.fp16.f64(i16 %in)
+; CHECK: bl __gnu_h2f_ieee
+; CHECK: vmov [[TMP:s[0-9]+]], r0
+; CHECK: vcvt.f64.f32 d0, [[TMP]]
+
+; CHECK-FP16: vmov [[TMP16:s[0-9]+]], r0
+; CHECK-FP16: vcvtb.f32.f16 [[TMP32:s[0-9]+]], [[TMP16]]
+; CHECK-FP16: vcvt.f64.f32 d0, [[TMP32]]
+
+; CHECK-ARMV8: vmov [[TMP:s[0-9]+]], r0
+; CHECK-ARMV8: vcvtb.f64.f16 d0, [[TMP]]
+
+; CHECK-SOFTFLOAT: bl __gnu_h2f_ieee
+; CHECK-SOFTFLOAT: bl __aeabi_f2d
+  ret double %val
+}
+
+define i16 @test_to_fp16(double %in) {
+; CHECK-LABEL: test_to_fp16:
+; CHECK-FP16-LABEL: test_to_fp16:
+; CHECK-ARMV8-LABEL: test_to_fp16:
+; CHECK-SOFTFLOAT-LABEL: test_to_fp16:
+  %val = call i16 @llvm.convert.to.fp16.f64(double %in)
+; CHECK: bl __aeabi_d2h
+
+; CHECK-FP16: bl __aeabi_d2h
+
+; CHECK-ARMV8: vcvtb.f16.f64 [[TMP:s[0-9]+]], d0
+; CHECK-ARMV8: vmov r0, [[TMP]]
+
+; CHECK-SOFTFLOAT: bl __aeabi_d2h
+  ret i16 %val
+}
+
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
 
-declare i16 @llvm.convert.to.fp16(float) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone
diff --git a/test/CodeGen/ARM/fpcmp-f64-neon-opt.ll b/test/CodeGen/ARM/fpcmp-f64-neon-opt.ll
new file mode 100644
index 0000000..7444a68
--- /dev/null
+++ b/test/CodeGen/ARM/fpcmp-f64-neon-opt.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=linux-arm-gnueabihf -mattr=+neon %s -o - | FileCheck %s
+
+; Check that no intermediate integer register is used.
+define i32 @no-intermediate-register-for-zero-imm(double %x) #0 {
+entry:
+; CHECK-LABEL: no-intermediate-register-for-zero-imm
+; CHECK-NOT: vmov
+; CHECK: vcmp
+  %cmp = fcmp une double %x, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
diff --git a/test/CodeGen/ARM/half.ll b/test/CodeGen/ARM/half.ll
new file mode 100644
index 0000000..10cebb3
--- /dev/null
+++ b/test/CodeGen/ARM/half.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-ios7.0 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-OLD
+; RUN: llc < %s -mtriple=thumbv7s-apple-ios7.0 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-F16
+; RUN: llc < %s -mtriple=thumbv8-apple-ios7.0 | FileCheck %s --check-prefix=CHECK  --check-prefix=CHECK-V8
+
+define void @test_load_store(half* %in, half* %out) {
+; CHECK-LABEL: test_load_store:
+; CHECK: ldrh [[TMP:r[0-9]+]], [r0]
+; CHECK: strh [[TMP]], [r1]
+  %val = load half* %in
+  store half %val, half* %out
+  ret void
+}
+
+define i16 @test_bitcast_from_half(half* %addr) {
+; CHECK-LABEL: test_bitcast_from_half:
+; CHECK: ldrh r0, [r0]
+  %val = load half* %addr
+  %val_int = bitcast half %val to i16
+  ret i16 %val_int
+}
+
+define void @test_bitcast_to_half(half* %addr, i16 %in) {
+; CHECK-LABEL: test_bitcast_to_half:
+; CHECK: strh r1, [r0]
+  %val_fp = bitcast i16 %in to half
+  store half %val_fp, half* %addr
+  ret void
+}
+
+define float @test_extend32(half* %addr) {
+; CHECK-LABEL: test_extend32:
+
+; CHECK-OLD: b.w ___gnu_h2f_ieee
+; CHECK-F16: vcvtb.f32.f16
+; CHECK-V8: vcvtb.f32.f16
+  %val16 = load half* %addr
+  %val32 = fpext half %val16 to float
+  ret float %val32
+}
+
+define double @test_extend64(half* %addr) {
+; CHECK-LABEL: test_extend64:
+
+; CHECK-OLD: blx ___gnu_h2f_ieee
+; CHECK-OLD: vcvt.f64.f32
+; CHECK-F16: vcvtb.f32.f16
+; CHECK-F16: vcvt.f64.f32
+; CHECK-V8: vcvtb.f64.f16
+  %val16 = load half* %addr
+  %val32 = fpext half %val16 to double
+  ret double %val32
+}
+
+define void @test_trunc32(float %in, half* %addr) {
+; CHECK-LABEL: test_trunc32:
+
+; CHECK-OLD: blx ___gnu_f2h_ieee
+; CHECK-F16: vcvtb.f16.f32
+; CHECK-V8: vcvtb.f16.f32
+  %val16 = fptrunc float %in to half
+  store half %val16, half* %addr
+  ret void
+}
+
+define void @test_trunc64(double %in, half* %addr) {
+; CHECK-LABEL: test_trunc64:
+
+; CHECK-OLD: blx ___truncdfhf2
+; CHECK-F16: blx ___truncdfhf2
+; CHECK-V8: vcvtb.f16.f64
+  %val16 = fptrunc double %in to half
+  store half %val16, half* %addr
+  ret void
+}
diff --git a/test/CodeGen/ARM/inlineasm-global.ll b/test/CodeGen/ARM/inlineasm-global.ll
new file mode 100644
index 0000000..fd210f4
--- /dev/null
+++ b/test/CodeGen/ARM/inlineasm-global.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=thumb-unknown-unknown -no-integrated-as < %s | FileCheck %s --check-prefix=THUMB
+; RUN: llc -mtriple=arm-unknown-unknown -no-integrated-as < %s | FileCheck %s --check-prefix=ARM
+
+; In thumb mode, emit ".code 16" before global inline-asm instructions.
+
+; THUMB: .code 16
+; THUMB: stmib
+; THUMB: .code 16
+
+; ARM-NOT: .code 16
+; ARM:     stmib
+
+module asm "stmib sp, {r0-r14};"
diff --git a/test/CodeGen/ARM/interrupt-attr.ll b/test/CodeGen/ARM/interrupt-attr.ll
index cb67dd9..96d1ee2 100644
--- a/test/CodeGen/ARM/interrupt-attr.ll
+++ b/test/CodeGen/ARM/interrupt-attr.ll
@@ -40,7 +40,7 @@ define arm_aapcscc void @irq_fn() alignstack(8) "interrupt"="IRQ" {
 ; CHECK-M: mov r4, sp
 ; CHECK-M: bic r4, r4, #7
 ; CHECK-M: mov sp, r4
-; CHECK-M: blx _bar
+; CHECK-M: bl _bar
 ; CHECK-M: sub.w r4, r11, #8
 ; CHECK-M: mov sp, r4
 ; CHECK-M: pop.w {r4, r10, r11, pc}
diff --git a/test/CodeGen/ARM/invalid-target.ll b/test/CodeGen/ARM/invalid-target.ll
new file mode 100644
index 0000000..bb0ada4
--- /dev/null
+++ b/test/CodeGen/ARM/invalid-target.ll
@@ -0,0 +1,32 @@
+; RUN: not llc -mtriple armvinvalid-linux-gnueabi %s -o - 2>&1 | \
+; RUN: FileCheck %s --check-prefix=ARMVINVALID
+
+; RUN: not llc -mtriple armebvinvalid-linux-gnueabi %s -o - 2>&1 | \
+; RUN: FileCheck %s --check-prefix=ARMEBVINVALID
+
+; RUN: not llc -mtriple thumbvinvalid-linux-gnueabi %s -o - 2>&1 | \
+; RUN: FileCheck %s --check-prefix=THUMBVINVALID
+
+; RUN: not llc -mtriple thumbebvinvalid-linux-gnueabi %s -o - 2>&1 | \
+; RUN: FileCheck %s --check-prefix=THUMBEBVINVALID
+
+; RUN: not llc -mtriple thumbv2-linux-gnueabi %s -o - 2>&1 | \
+; RUN: FileCheck %s --check-prefix=THUMBV2
+
+; RUN: not llc -mtriple thumbv3-linux-gnueabi %s -o - 2>&1 | \
+; RUN: FileCheck %s --check-prefix=THUMBV3
+
+; RUN: not llc -mtriple arm64invalid-linux-gnu %s -o - 2>&1 | \
+; RUN: FileCheck %s --check-prefix=ARM64INVALID
+
+; RUN: not llc -mtriple aarch64invalid-linux-gnu %s -o - 2>&1 | \
+; RUN: FileCheck %s --check-prefix=AARCH64INVALID
+
+; ARMVINVALID: error: unable to get target for 'armvinvalid--linux-gnueabi'
+; ARMEBVINVALID: error: unable to get target for 'armebvinvalid--linux-gnueabi'
+; THUMBVINVALID: error: unable to get target for 'thumbvinvalid--linux-gnueabi'
+; THUMBEBVINVALID: error: unable to get target for 'thumbebvinvalid--linux-gnueabi'
+; THUMBV2: error: unable to get target for 'thumbv2--linux-gnueabi'
+; THUMBV3: error: unable to get target for 'thumbv3--linux-gnueabi'
+; ARM64INVALID: error: unable to get target for 'arm64invalid--linux-gnu'
+; AARCH64INVALID: error: unable to get target for 'aarch64invalid--linux-gnu'
diff --git a/test/CodeGen/ARM/jump_tables.ll b/test/CodeGen/ARM/jump_tables.ll
deleted file mode 100644
index 907a86c..0000000
--- a/test/CodeGen/ARM/jump_tables.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; RUN: llc <%s -mtriple=arm-unknown-linux-gnueabi -jump-table-type=single | FileCheck --check-prefix=ARM %s
-; RUN: llc <%s -mtriple=thumb-unknown-linux-gnueabi -jump-table-type=single | FileCheck --check-prefix=THUMB %s
-
-define void @indirect_fun() unnamed_addr jumptable {
-  ret void
-}
-define void ()* @get_fun() {
-  ret void ()* @indirect_fun
-
-; ARM:         ldr     r0, [[LABEL:.*]]
-; ARM:         mov     pc, lr
-; ARM: [[LABEL]]:
-; ARM:         .long   __llvm_jump_instr_table_0_1
-
-; THUMB:         ldr     r0, [[LABEL:.*]]
-; THUMB:         bx      lr
-; THUMB: [[LABEL]]:
-; THUMB:         .long   __llvm_jump_instr_table_0_1
-}
-
-; ARM:         .globl  __llvm_jump_instr_table_0_1
-; ARM:         .align  3
-; ARM:         .type   __llvm_jump_instr_table_0_1,%function
-; ARM: __llvm_jump_instr_table_0_1:
-; ARM:         b     indirect_fun(PLT)
-
-; THUMB:         .globl  __llvm_jump_instr_table_0_1
-; THUMB:         .align  3
-; THUMB:         .thumb_func
-; THUMB:         .type   __llvm_jump_instr_table_0_1,%function
-; THUMB: __llvm_jump_instr_table_0_1:
-; THUMB:         b     indirect_fun(PLT)
diff --git a/test/CodeGen/ARM/negative-offset.ll b/test/CodeGen/ARM/negative-offset.ll
new file mode 100644
index 0000000..7b949fd
--- /dev/null
+++ b/test/CodeGen/ARM/negative-offset.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=arm-eabi -O3 %s -o - | FileCheck %s
+
+; Function Attrs: nounwind readonly
+define arm_aapcscc i32 @sum(i32* nocapture readonly %p) #0 {
+entry:
+;CHECK-LABEL: sum:
+;CHECK-NOT: sub
+;CHECK: ldr r{{.*}}, [r0, #-16]
+;CHECK: ldr r{{.*}}, [r0, #-8]
+  %arrayidx = getelementptr inbounds i32* %p, i32 -4
+  %0 = load i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %p, i32 -2
+  %1 = load i32* %arrayidx1, align 4
+  %add = add nsw i32 %1, %0
+  ret i32 %add
+}
+
diff --git a/test/CodeGen/ARM/no-tail-call.ll b/test/CodeGen/ARM/no-tail-call.ll
new file mode 100644
index 0000000..3a8cb21
--- /dev/null
+++ b/test/CodeGen/ARM/no-tail-call.ll
@@ -0,0 +1,84 @@
+; RUN: llc < %s -O0 -o - | FileCheck %s
+target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "armv7s-apple-ios7"
+
+%foo = type <{ %Sf }>
+%Sf = type <{ float }>
+
+declare float @llvm.ceil.f32(float) 
+
+; Check that we are not emitting a tail call for the last call to ceil.
+; This function returns three different results.
+; CHECK-LABEL: func1:
+; CHECK-NOT: b _ceilf
+; CHECK: pop
+define { float, float, float } @func1() {
+entry:
+  %0 = alloca %foo, align 4
+  %1 = alloca %foo, align 4
+  %2 = alloca %foo, align 4
+  %.native = getelementptr inbounds %foo* %0, i32 0, i32 0
+  %.native.value = getelementptr inbounds %Sf* %.native, i32 0, i32 0
+  store float 0.000000e+00, float* %.native.value, align 4
+  %.native1 = getelementptr inbounds %foo* %1, i32 0, i32 0
+  %.native1.value = getelementptr inbounds %Sf* %.native1, i32 0, i32 0
+  store float 1.000000e+00, float* %.native1.value, align 4
+  %.native2 = getelementptr inbounds %foo* %2, i32 0, i32 0
+  %.native2.value = getelementptr inbounds %Sf* %.native2, i32 0, i32 0
+  store float 5.000000e+00, float* %.native2.value, align 4
+  br i1 true, label %3, label %4
+
+; <label>:3                                       ; preds = %entry
+  %.native4 = getelementptr inbounds %foo* %1, i32 0, i32 0
+  %.native4.value = getelementptr inbounds %Sf* %.native4, i32 0, i32 0
+  store float 2.000000e+00, float* %.native4.value, align 4
+  br label %4
+
+; <label>:4                                       ; preds = %3, %entry
+  %5 = call float @llvm.ceil.f32(float 5.000000e+00)
+  %.native3 = getelementptr inbounds %foo* %1, i32 0, i32 0
+  %.native3.value = getelementptr inbounds %Sf* %.native3, i32 0, i32 0
+  %6 = load float* %.native3.value, align 4
+  %7 = call float @llvm.ceil.f32(float %6)
+  %8 = insertvalue { float, float, float } { float 0.000000e+00, float undef, float undef }, float %5, 1
+  %9 = insertvalue { float, float, float } %8, float %7, 2
+  ret { float, float, float } %9
+}
+
+; Check that we are not emitting a tail call for the last call to ceil.
+; This function returns two different results.
+; CHECK-LABEL: func2:
+; CHECK-NOT: b _ceilf
+; CHECK: pop
+define { float, float } @func2() {
+entry:
+  %0 = alloca %foo, align 4
+  %1 = alloca %foo, align 4
+  %2 = alloca %foo, align 4
+  %.native = getelementptr inbounds %foo* %0, i32 0, i32 0
+  %.native.value = getelementptr inbounds %Sf* %.native, i32 0, i32 0
+  store float 0.000000e+00, float* %.native.value, align 4
+  %.native1 = getelementptr inbounds %foo* %1, i32 0, i32 0
+  %.native1.value = getelementptr inbounds %Sf* %.native1, i32 0, i32 0
+  store float 1.000000e+00, float* %.native1.value, align 4
+  %.native2 = getelementptr inbounds %foo* %2, i32 0, i32 0
+  %.native2.value = getelementptr inbounds %Sf* %.native2, i32 0, i32 0
+  store float 5.000000e+00, float* %.native2.value, align 4
+  br i1 true, label %3, label %4
+
+; <label>:3                                       ; preds = %entry
+  %.native4 = getelementptr inbounds %foo* %1, i32 0, i32 0
+  %.native4.value = getelementptr inbounds %Sf* %.native4, i32 0, i32 0
+  store float 2.000000e+00, float* %.native4.value, align 4
+  br label %4
+
+; <label>:4                                       ; preds = %3, %entry
+  %5 = call float @llvm.ceil.f32(float 5.000000e+00)
+  %.native3 = getelementptr inbounds %foo* %1, i32 0, i32 0
+  %.native3.value = getelementptr inbounds %Sf* %.native3, i32 0, i32 0
+  %6 = load float* %.native3.value, align 4
+  %7 = call float @llvm.ceil.f32(float %6)
+  %8 = insertvalue { float, float } { float 0.000000e+00, float undef }, float %7, 1
+  ret { float, float } %8
+}
+
diff --git a/test/CodeGen/ARM/none-macho-v4t.ll b/test/CodeGen/ARM/none-macho-v4t.ll
new file mode 100644
index 0000000..4c6e68e
--- /dev/null
+++ b/test/CodeGen/ARM/none-macho-v4t.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=thumb-none-macho -mcpu=arm7tdmi %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-none-macho -mcpu=arm7tdmi %s -filetype=obj -o /dev/null
+
+declare void @callee()
+
+define void @test_call() {
+  ; BX can only take a register before v5t came along, so we must materialise
+  ; the address properly.
+; CHECK-LABEL: test_call:
+; CHECK: ldr r[[CALLEE_STUB:[0-9]+]], [[LITPOOL:LCPI[0-9]+_[0-9]+]]
+; CHECK: [[PC_LABEL:LPC[0-9]+_[0-9]+]]:
+; CHECK-NEXT: add r[[CALLEE_STUB]], pc
+; CHECK: ldr [[CALLEE:r[0-9]+]], [r[[CALLEE_STUB]]]
+; CHECK: mov lr, pc
+; CHECK: bx [[CALLEE]]
+
+; CHECK: [[LITPOOL]]:
+; CHECK-NEXT: .long L_callee$non_lazy_ptr-([[PC_LABEL]]+4)
+  call void @callee()
+  ret void
+}
diff --git a/test/CodeGen/ARM/none-macho.ll b/test/CodeGen/ARM/none-macho.ll
index 60c2171..2a7878f 100644
--- a/test/CodeGen/ARM/none-macho.ll
+++ b/test/CodeGen/ARM/none-macho.ll
@@ -84,7 +84,7 @@ define float @test_softfloat_calls(float %in) {
 
   ; Soft-float calls should be GNU-style rather than RTABI and should not be the
   ; *vfp variants used for ARMv6 iOS.
-; CHECK: blx ___addsf3{{$}}
+; CHECK: bl ___addsf3{{$}}
   ret float %sum
 }
 
diff --git a/test/CodeGen/ARM/out-of-registers.ll b/test/CodeGen/ARM/out-of-registers.ll
new file mode 100644
index 0000000..790e416
--- /dev/null
+++ b/test/CodeGen/ARM/out-of-registers.ll
@@ -0,0 +1,42 @@
+; RUN: llc -O3 %s -o - | FileCheck %s
+; ModuleID = 'fo.c'
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n8:16:32-S64"
+target triple = "thumbv7-none-linux-gnueabi"
+
+; CHECK: vpush
+; CHECK: vpop
+
+define void @foo(float* nocapture %A) #0 {
+  %1= bitcast float* %A to i8*
+  %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4)
+  %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0
+  %divp_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %3
+  %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1
+  %div3p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %4
+  %5 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 2
+  %div8p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %5
+  %6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3
+  %div13p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %6
+  tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %divp_vec, <4 x float> %div3p_vec, <4 x float> %div8p_vec, <4 x float> %div13p_vec, i32 4)
+ ret void
+}
+
+; Function Attrs: nounwind
+declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32) #1
+
+; Function Attrs: nounwind readonly
+
+; Function Attrs: nounwind
+declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #1
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32) #2
+
+; Function Attrs: nounwind
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readonly }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"Snapdragon LLVM ARM Compiler 3.4"}
+!1 = metadata !{metadata !1}
diff --git a/test/CodeGen/ARM/pr18364-movw.ll b/test/CodeGen/ARM/pr18364-movw.ll
new file mode 100644
index 0000000..fdcf154
--- /dev/null
+++ b/test/CodeGen/ARM/pr18364-movw.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=armv5te | FileCheck %s --check-prefix=V5
+; RUN: llc < %s -mtriple=armv6   | FileCheck %s --check-prefix=V6
+; RUN: llc < %s -mtriple=armv6t2 | FileCheck %s --check-prefix=V6T2
+; RUN: llc < %s -mtriple=armv7   | FileCheck %s --check-prefix=V7
+; PR18364
+
+define i64 @f() #0 {
+entry:
+; V5-NOT: movw
+; V6-NOT: movw
+; V6T2: movw
+; V7: movw
+  %y = alloca i64, align 8
+  %z = alloca i64, align 8
+  store i64 1, i64* %y, align 8
+  store i64 11579764786944, i64* %z, align 8
+  %0 = load i64* %y, align 8
+  %1 = load i64* %z, align 8
+  %sub = sub i64 %0, %1
+  ret i64 %sub
+}
+
+define i64 @g(i64 %a, i32 %b) #0 {
+entry:
+; V5-NOT: movw
+; V6-NOT: movw
+; V6T2: movw
+; V7: movw
+  %0 = mul i64 %a, 86400000
+  %mul = add i64 %0, -210866803200000
+  %conv = sext i32 %b to i64
+  %add = add nsw i64 %mul, %conv
+  ret i64 %add
+}
diff --git a/test/CodeGen/ARM/preferred-align.ll b/test/CodeGen/ARM/preferred-align.ll
new file mode 100644
index 0000000..8cd4ef6
--- /dev/null
+++ b/test/CodeGen/ARM/preferred-align.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=armv7-linux-gnueabi %s -o - | FileCheck %s
+
+@var_agg = global {i8, i8} zeroinitializer
+
+; CHECK: .globl var_agg
+; CHECK-NEXT: .align 2
+
+@var1 = global i1 zeroinitializer
+
+; CHECK: .globl var1
+; CHECK-NOT: .align
+
+@var8 = global i8 zeroinitializer
+
+; CHECK: .globl var8
+; CHECK-NOT: .align
+
+@var16 = global i16 zeroinitializer
+
+; CHECK: .globl var16
+; CHECK-NEXT: .align 1
+\ No newline at end of file
diff --git a/test/CodeGen/ARM/prefetch.ll b/test/CodeGen/ARM/prefetch.ll
index 7350e0a..7fdc5b6 100644
--- a/test/CodeGen/ARM/prefetch.ll
+++ b/test/CodeGen/ARM/prefetch.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=thumb-eabi -mattr=-thumb2 %s -o - | FileCheck %s -check-prefix CHECK-T1
 ; RUN: llc -mtriple=thumb-eabi -mattr=+v7 %s -o - | FileCheck %s -check-prefix=THUMB2
 ; RUN: llc -mtriple=arm-eabi -mattr=+v7 %s -o - | FileCheck %s -check-prefix=ARM
-; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9-mp %s -o - | FileCheck %s -check-prefix=ARM-MP
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=ARM-MP
 ; rdar://8601536
 
 ; CHECK-T1-NOT: pld
diff --git a/test/CodeGen/ARM/rbit.ll b/test/CodeGen/ARM/rbit.ll
new file mode 100644
index 0000000..41f866f
--- /dev/null
+++ b/test/CodeGen/ARM/rbit.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=armv8-eabi %s -o - | FileCheck %s
+
+; CHECK-LABEL: rbit
+; CHECK: rbit r0, r0
+define i32 @rbit(i32 %t) {
+entry:
+  %rbit = call i32 @llvm.arm.rbit(i32 %t)
+  ret i32 %rbit
+}
+
+; CHECK-LABEL: rbit_constant
+; CHECK: mov r0, #0
+; CHECK: rbit r0, r0
+define i32 @rbit_constant() {
+entry:
+  %rbit.i = call i32 @llvm.arm.rbit(i32 0)
+  ret i32 %rbit.i
+}
+
+declare i32 @llvm.arm.rbit(i32)
diff --git a/test/CodeGen/ARM/sbfx.ll b/test/CodeGen/ARM/sbfx.ll
index 3c25edc..5b77c59 100644
--- a/test/CodeGen/ARM/sbfx.ll
+++ b/test/CodeGen/ARM/sbfx.ll
@@ -45,3 +45,21 @@ entry:
     %tmp2 = ashr i32 %tmp, 1
     ret i32 %tmp2
 }
+
+define signext i8 @f6(i32 %a) {
+; CHECK-LABEL: f6:
+; CHECK: sbfx r0, r0, #23, #8
+
+  %tmp = lshr i32 %a, 23
+  %res = trunc i32 %tmp to i8
+  ret i8 %res
+}
+
+define signext i8 @f7(i32 %a) {
+; CHECK-LABEL: f7:
+; CHECK-NOT: sbfx
+
+  %tmp = lshr i32 %a, 25
+  %res = trunc i32 %tmp to i8
+  ret i8 %res
+}
diff --git a/test/CodeGen/ARM/select_xform.ll b/test/CodeGen/ARM/select_xform.ll
index e13504a..326eb51 100644
--- a/test/CodeGen/ARM/select_xform.ll
+++ b/test/CodeGen/ARM/select_xform.ll
@@ -222,3 +222,110 @@ entry:
   %add = add i32 %conv, %c
   ret i32 %add
 }
+
+; Do not fold the xor into the select
+define i32 @t15(i32 %p) {
+entry:
+; ARM-LABEL: t15:
+; ARM: mov     [[REG:r[0-9]+]], #2
+; ARM: cmp     r0, #8
+; ARM: movwgt  [[REG:r[0-9]+]], #1
+; ARM: eor     r0, [[REG:r[0-9]+]], #1
+
+; T2-LABEL: t15:
+; T2: movs    [[REG:r[0-9]+]], #2
+; T2: cmp     [[REG:r[0-9]+]], #8
+; T2: it      gt
+; T2: movgt   [[REG:r[0-9]+]], #1
+; T2: eor     r0, [[REG:r[0-9]+]], #1
+  %cmp = icmp sgt i32 %p, 8
+  %a = select i1 %cmp, i32 1, i32 2
+  %xor = xor i32 %a, 1
+  ret i32 %xor
+}
+
+define i32 @t16(i32 %x, i32 %y) {
+entry:
+; ARM-LABEL: t16:
+; ARM: and r0, {{r[0-9]+}}, {{r[0-9]+}}
+
+; T2-LABEL: t16:
+; T2: ands r0, {{r[0-9]+}}
+  %cmp = icmp eq i32 %x, 0
+  %cond = select i1 %cmp, i32 5, i32 2
+  %cmp1 = icmp eq i32 %y, 0
+  %cond2 = select i1 %cmp1, i32 3, i32 4
+  %and = and i32 %cond2, %cond
+  ret i32 %and
+}
+
+define i32 @t17(i32 %x, i32 %y) #0 {
+entry:
+; ARM-LABEL: t17:
+; ARM: and r0, {{r[0-9]+}}, {{r[0-9]+}}
+
+; T2-LABEL: t17:
+; T2: ands r0, {{r[0-9]+}}
+  %cmp = icmp eq i32 %x, -1
+  %cond = select i1 %cmp, i32 5, i32 2
+  %cmp1 = icmp eq i32 %y, -1
+  %cond2 = select i1 %cmp1, i32 3, i32 4
+  %and = and i32 %cond2, %cond
+  ret i32 %and
+}
+
+define i32 @t18(i32 %x, i32 %y) #0 {
+entry:
+; ARM-LABEL: t18:
+; ARM: and r0, {{r[0-9]+}}, {{r[0-9]+}}
+
+; T2-LABEL: t18:
+; T2: and.w r0, {{r[0-9]+}}
+  %cmp = icmp ne i32 %x, 0
+  %cond = select i1 %cmp, i32 5, i32 2
+  %cmp1 = icmp ne i32 %x, -1
+  %cond2 = select i1 %cmp1, i32 3, i32 4
+  %and = and i32 %cond2, %cond
+  ret i32 %and
+}
+
+define i32 @t19(i32 %x, i32 %y) #0 {
+entry:
+; ARM-LABEL: t19:
+; ARM: orr r0, {{r[0-9]+}}, {{r[0-9]+}}
+
+; T2-LABEL: t19:
+; T2: orrs r0, {{r[0-9]+}}
+  %cmp = icmp ne i32 %x, 0
+  %cond = select i1 %cmp, i32 5, i32 2
+  %cmp1 = icmp ne i32 %y, 0
+  %cond2 = select i1 %cmp1, i32 3, i32 4
+  %or = or i32 %cond2, %cond
+  ret i32 %or
+}
+
+define i32 @t20(i32 %x, i32 %y) #0 {
+entry:
+; ARM-LABEL: t20:
+; ARM: orr r0, {{r[0-9]+}}, {{r[0-9]+}}
+
+; T2-LABEL: t20:
+; T2: orrs r0, {{r[0-9]+}}
+  %cmp = icmp ne i32 %x, -1
+  %cond = select i1 %cmp, i32 5, i32 2
+  %cmp1 = icmp ne i32 %y, -1
+  %cond2 = select i1 %cmp1, i32 3, i32 4
+  %or = or i32 %cond2, %cond
+  ret i32 %or
+}
+
+define  <2 x i32> @t21(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK-LABEL: t21:
+; CHECK-NOT: eor
+; CHECK: mvn
+; CHECK-NOT: eor
+  %tst = icmp eq <2 x i32> %lhs, %rhs
+  %ntst = xor <2 x i1> %tst, <i1 1 , i1 undef>
+  %btst = sext <2 x i1> %ntst to <2 x i32>
+  ret <2 x i32> %btst
+}
diff --git a/test/CodeGen/ARM/sjljehprepare-lower-empty-struct.ll b/test/CodeGen/ARM/sjljehprepare-lower-empty-struct.ll
new file mode 100644
index 0000000..3cf2a08
--- /dev/null
+++ b/test/CodeGen/ARM/sjljehprepare-lower-empty-struct.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=armv7-apple-ios -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=armv7-apple-ios -O1 < %s | FileCheck %s
+; RUN: llc -mtriple=armv7-apple-ios -O2 < %s | FileCheck %s
+; RUN: llc -mtriple=armv7-apple-ios -O3 < %s | FileCheck %s
+
+; SjLjEHPrepare shouldn't crash when lowering empty structs.
+;
+; Checks that between in case of empty structs used as arguments
+; nothing happens, i.e. there are no instructions between
+; __Unwind_SjLj_Register and actual @bar invocation
+
+
+define i8* @foo(i8 %a, {} %c) {
+entry:
+; CHECK: bl __Unwind_SjLj_Register
+; CHECK-NEXT: {{[A-Z][a-zA-Z0-9]*}}:
+; CHECK-NEXT: bl _bar
+  invoke void @bar ()
+    to label %unreachable unwind label %handler
+
+unreachable:
+  unreachable
+
+handler:
+  %tmp = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @baz to i8*)
+  cleanup
+  resume { i8*, i32 } undef
+}
+
+declare void @bar()
+declare i32 @baz(...)
diff --git a/test/CodeGen/ARM/smulw.ll b/test/CodeGen/ARM/smulw.ll
new file mode 100644
index 0000000..8653903
--- /dev/null
+++ b/test/CodeGen/ARM/smulw.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=arm--none-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb--none-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
+
+; We cannot codegen the smulw[bt] or smlaw[bt] instructions for these functions,
+; as the top 16 bits of the result would differ
+
+define i32 @f1(i32 %a, i16 %b) {
+; CHECK-LABEL: f1:
+; CHECK: mul
+; CHECK: asr
+  %tmp1 = sext i16 %b to i32
+  %tmp2 = mul i32 %a, %tmp1
+  %tmp3 = ashr i32 %tmp2, 16
+  ret i32 %tmp3
+}
+
+define i32 @f2(i32 %a, i16 %b, i32 %c) {
+; CHECK-LABEL: f2:
+; CHECK: mul
+; CHECK: add{{.*}}, asr #16
+  %tmp1 = sext i16 %b to i32
+  %tmp2 = mul i32 %a, %tmp1
+  %tmp3 = ashr i32 %tmp2, 16
+  %tmp4 = add i32 %tmp3, %c
+  ret i32 %tmp4
+}
diff --git a/test/CodeGen/ARM/space-directive.ll b/test/CodeGen/ARM/space-directive.ll
new file mode 100644
index 0000000..55be199
--- /dev/null
+++ b/test/CodeGen/ARM/space-directive.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=armv7 -o - %s | FileCheck %s
+
+define i32 @test_space() minsize {
+; CHECK-LABEL: test_space:
+; CHECK: ldr {{r[0-9]+}}, [[CPENTRY:.?LCPI[0-9]+_[0-9]+]]
+; CHECK: b [[PAST_CP:.?LBB[0-9]+_[0-9]+]]
+
+; CHECK: [[CPENTRY]]:
+; CHECK-NEXT: 12345678
+
+; CHECK: [[PAST_CP]]:
+; CHECK: .zero 10000
+  %addr = inttoptr i32 12345678 to i32*
+  %val = load i32* %addr
+  call i32 @llvm.arm.space(i32 10000, i32 undef)
+  ret i32 %val
+}
+
+declare i32 @llvm.arm.space(i32, i32)
diff --git a/test/CodeGen/ARM/stack_guard_remat.ll b/test/CodeGen/ARM/stack_guard_remat.ll
new file mode 100644
index 0000000..b11ea92
--- /dev/null
+++ b/test/CodeGen/ARM/stack_guard_remat.ll
@@ -0,0 +1,70 @@
+; RUN: llc < %s -mtriple=arm-apple-ios -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=PIC
+; RUN: llc < %s -mtriple=arm-apple-ios -relocation-model=static -no-integrated-as | FileCheck %s -check-prefix=NO-PIC -check-prefix=STATIC
+; RUN: llc < %s -mtriple=arm-apple-ios -relocation-model=dynamic-no-pic -no-integrated-as | FileCheck %s  -check-prefix=NO-PIC -check-prefix=DYNAMIC-NO-PIC
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=PIC-V7
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 -relocation-model=static -no-integrated-as | FileCheck %s -check-prefix=STATIC-V7
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 -relocation-model=dynamic-no-pic -no-integrated-as | FileCheck %s -check-prefix=DYNAMIC-NO-PIC-V7
+
+;PIC:   foo2
+;PIC:   ldr [[R0:r[0-9]+]], [[LABEL0:LCPI[0-9_]+]]
+;PIC: [[LABEL1:LPC0_1]]:
+;PIC:   ldr [[R1:r[0-9]+]], [pc, [[R0]]]
+;PIC:   ldr [[R2:r[0-9]+]], {{\[}}[[R1]]{{\]}}
+;PIC:   ldr {{r[0-9]+}}, {{\[}}[[R2]]{{\]}}
+
+;PIC:      [[LABEL0]]:
+;PIC-NEXT:   .long L___stack_chk_guard$non_lazy_ptr-([[LABEL1]]+8)
+
+;NO-PIC: foo2
+;NO-PIC: ldr [[R0:r[0-9]+]], [[LABEL0:LCPI[0-9_]+]]
+;NO-PIC-NOT: LPC
+;NO-PIC: ldr {{r[0-9]+}}, {{\[}}[[R0]]{{\]}}
+
+;STATIC:      [[LABEL0]]:
+;STATIC-NEXT:   .long ___stack_chk_guard
+
+;DYNAMIC-NO-PIC:      [[LABEL0]]:
+;DYNAMIC-NO-PIC-NEXT:   .long L___stack_chk_guard$non_lazy_ptr
+
+;PIC-V7:   movw [[R0:r[0-9]+]], :lower16:(L___stack_chk_guard$non_lazy_ptr-([[LABEL0:LPC[0-9_]+]]+8))
+;PIC-V7:   movt [[R0]], :upper16:(L___stack_chk_guard$non_lazy_ptr-([[LABEL0]]+8))
+;PIC-V7: [[LABEL0]]:
+;PIC-V7:   ldr [[R0]], {{\[}}pc, [[R0]]{{\]}}
+;PIC-V7:   ldr [[R0]], {{\[}}[[R0]]{{\]}}
+
+;PIC-V7: L___stack_chk_guard$non_lazy_ptr:
+;PIC-V7:   .indirect_symbol        ___stack_chk_guard
+
+;STATIC-V7: movw [[R0:r[0-9]+]], :lower16:___stack_chk_guard
+;STATIC-V7: movt [[R0]], :upper16:___stack_chk_guard
+;STATIC-V7: ldr  [[R0]], {{\[}}[[R0]]{{\]}}
+
+;DYNAMIC-NO-PIC-V7: movw [[R0:r[0-9]+]], :lower16:L___stack_chk_guard$non_lazy_ptr
+;DYNAMIC-NO-PIC-V7: movt [[R0]], :upper16:L___stack_chk_guard$non_lazy_ptr
+;DYNAMIC-NO-PIC-V7: ldr  [[R0]], {{\[}}[[R0]]{{\]}}
+;DYNAMIC-NO-PIC-V7: ldr  [[R0]], {{\[}}[[R0]]{{\]}}
+
+;DYNAMIC-NO-PIC-V7: L___stack_chk_guard$non_lazy_ptr:
+;DYNAMIC-NO-PIC-V7:   .indirect_symbol        ___stack_chk_guard
+
+; Function Attrs: nounwind ssp
+define i32 @test_stack_guard_remat() #0 {
+  %a1 = alloca [256 x i32], align 4
+  %1 = bitcast [256 x i32]* %a1 to i8*
+  call void @llvm.lifetime.start(i64 1024, i8* %1)
+  %2 = getelementptr inbounds [256 x i32]* %a1, i32 0, i32 0
+  call void @foo3(i32* %2) #3
+  call void asm sideeffect "foo2", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{sp},~{lr}"()
+  call void @llvm.lifetime.end(i64 1024, i8* %1)
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+declare void @foo3(i32*)
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/ARM/swift-atomics.ll b/test/CodeGen/ARM/swift-atomics.ll
index 1d71815..8b100f1 100644
--- a/test/CodeGen/ARM/swift-atomics.ll
+++ b/test/CodeGen/ARM/swift-atomics.ll
@@ -8,6 +8,7 @@ define void @test_store_release(i32* %p, i32 %v) {
 ; CHECK: dmb ishst
 ; CHECK: str
 
+; CHECK-STRICT-ATOMIC-LABEL: test_store_release:
 ; CHECK-STRICT-ATOMIC: dmb {{ish$}}
   store atomic i32 %v, i32* %p release, align 4
   ret void
@@ -24,7 +25,11 @@ define i32 @test_seq_cst(i32* %p, i32 %v) {
 ; CHECK: ldr
 ; CHECK: dmb {{ish$}}
 
+; CHECK-STRICT-ATOMIC-LABEL: test_seq_cst:
 ; CHECK-STRICT-ATOMIC: dmb {{ish$}}
+; CHECK-STRICT-ATOMIC: str
+; CHECK-STRICT-ATOMIC: dmb {{ish$}}
+; CHECK-STRICT-ATOMIC: ldr
 ; CHECK-STRICT-ATOMIC: dmb {{ish$}}
 
   store atomic i32 %v, i32* %p seq_cst, align 4
@@ -39,6 +44,7 @@ define i32 @test_acq(i32* %addr) {
 ; CHECK: ldr
 ; CHECK: dmb {{ish$}}
 
+; CHECK-STRICT-ATOMIC-LABEL: test_acq:
 ; CHECK-STRICT-ATOMIC: dmb {{ish$}}
   %val = load atomic i32* %addr acquire, align 4
   ret i32 %val
diff --git a/test/CodeGen/ARM/sxt_rot.ll b/test/CodeGen/ARM/sxt_rot.ll
index 5ddea2e..4162691 100644
--- a/test/CodeGen/ARM/sxt_rot.ll
+++ b/test/CodeGen/ARM/sxt_rot.ll
@@ -9,7 +9,8 @@ define i32 @test0(i8 %A) {
 
 define signext i8 @test1(i32 %A) {
 ; CHECK: test1
-; CHECK: sxtb r0, r0, ror #8
+; CHECK: lsr r0, r0, #8
+; CHECK: sxtb r0, r0
   %B = lshr i32 %A, 8
   %C = shl i32 %A, 24
   %D = or i32 %B, %C
diff --git a/test/CodeGen/ARM/tail-call.ll b/test/CodeGen/ARM/tail-call.ll
index 7711586..c3e7965 100644
--- a/test/CodeGen/ARM/tail-call.ll
+++ b/test/CodeGen/ARM/tail-call.ll
@@ -3,6 +3,7 @@
 ; RUN:   | FileCheck %s -check-prefix CHECK-NO-TAIL
 
 declare i32 @callee(i32 %i)
+declare extern_weak fastcc void @callee_weak()
 
 define i32 @caller(i32 %i) {
 entry:
@@ -19,3 +20,12 @@ entry:
 ; CHECK-NO-TAIL: pop {lr}
 ; CHECK-NO-TAIL: bx lr
 
+
+; Weakly-referenced extern functions cannot be tail-called, as AAELF does
+; not define the behaviour of branch instructions to undefined weak symbols.
+define fastcc void @caller_weak() {
+; CHECK-LABEL: caller_weak:
+; CHECK: bl callee_weak
+  tail call void @callee_weak()
+  ret void
+}
diff --git a/test/CodeGen/ARM/tail-merge-branch-weight.ll b/test/CodeGen/ARM/tail-merge-branch-weight.ll
new file mode 100644
index 0000000..9b5d566
--- /dev/null
+++ b/test/CodeGen/ARM/tail-merge-branch-weight.ll
@@ -0,0 +1,44 @@
+; RUN: llc -mtriple=arm-apple-ios -print-machineinstrs=branch-folder \
+; RUN: %s -o /dev/null 2>&1 | FileCheck %s
+
+; Branch probability of tailed-merged block:
+;
+; p(L0_L1 -> L2) = p(entry -> L0) * p(L0 -> L2) + p(entry -> L1) * p(L1 -> L2)
+;                = 0.2 * 0.6 + 0.8 * 0.3 = 0.36
+; p(L0_L1 -> L3) = p(entry -> L0) * p(L0 -> L3) + p(entry -> L1) * p(L1 -> L3)
+;                = 0.2 * 0.4 + 0.8 * 0.7 = 0.64
+
+; CHECK: # Machine code for function test0:
+; CHECK: Successors according to CFG: BB#{{[0-9]+}}(13) BB#{{[0-9]+}}(24)
+; CHECK: BB#{{[0-9]+}}:
+; CHECK: BB#{{[0-9]+}}:
+; CHECK: # End machine code for function test0.
+
+define i32 @test0(i32 %n, i32 %m, i32* nocapture %a, i32* nocapture %b) {
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %L0, label %L1, !prof !0
+
+L0:                                          ; preds = %entry
+  store i32 12, i32* %a, align 4
+  store i32 18, i32* %b, align 4
+  %cmp1 = icmp eq i32 %m, 8
+  br i1 %cmp1, label %L2, label %L3, !prof !1
+
+L1:                                          ; preds = %entry
+  store i32 14, i32* %a, align 4
+  store i32 18, i32* %b, align 4
+  %cmp3 = icmp eq i32 %m, 8
+  br i1 %cmp3, label %L2, label %L3, !prof !2
+
+L2:                                               ; preds = %L1, %L0
+  br label %L3
+
+L3:                                           ; preds = %L0, %L1, %L2
+  %retval.0 = phi i32 [ 100, %L2 ], [ 6, %L1 ], [ 6, %L0 ]
+  ret i32 %retval.0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 200, i32 800}
+!1 = metadata !{metadata !"branch_weights", i32 600, i32 400}
+!2 = metadata !{metadata !"branch_weights", i32 300, i32 700}
diff --git a/test/CodeGen/ARM/thumb1-varalloc.ll b/test/CodeGen/ARM/thumb1-varalloc.ll
index e07e8aa..8d5888d 100644
--- a/test/CodeGen/ARM/thumb1-varalloc.ll
+++ b/test/CodeGen/ARM/thumb1-varalloc.ll
@@ -1,13 +1,15 @@
 ; RUN: llc < %s -mtriple=thumbv6-apple-darwin | FileCheck %s
 ; RUN: llc < %s -mtriple=thumbv6-apple-darwin -regalloc=basic | FileCheck %s
-; rdar://8819685
+; RUN: llc < %s -o %t -filetype=obj -mtriple=thumbv6-apple-darwin
+; RUN: llvm-objdump -triple=thumbv6-apple-darwin -d %t | FileCheck %s
 
 @__bar = external hidden global i8*
 @__baz = external hidden global i8*
 
+; rdar://8819685
 define i8* @_foo() {
 entry:
-; CHECK: foo:
+; CHECK-LABEL: foo:
 
 	%size = alloca i32, align 4
 	%0 = load i8** @__bar, align 4
@@ -40,3 +42,102 @@ bb3:
 
 declare noalias i8* @strdup(i8* nocapture) nounwind
 declare i32 @_called_func(i8*, i32*) nounwind
+
+; Variable ending up at unaligned offset from sp (i.e. not a multiple of 4)
+define void @test_local_var_addr() {
+; CHECK-LABEL: test_local_var_addr:
+
+  %addr1 = alloca i8
+  %addr2 = alloca i8
+
+; CHECK: mov r0, sp
+; CHECK: adds r0, #{{[0-9]+}}
+; CHECK: blx
+  call void @take_ptr(i8* %addr1)
+
+; CHECK: mov r0, sp
+; CHECK: adds r0, #{{[0-9]+}}
+; CHECK: blx
+  call void @take_ptr(i8* %addr2)
+
+  ret void
+}
+
+; Simple variable ending up *at* sp.
+define void @test_simple_var() {
+; CHECK-LABEL: test_simple_var:
+
+  %addr32 = alloca i32
+  %addr8 = bitcast i32* %addr32 to i8*
+
+; CHECK: mov r0, sp
+; CHECK-NOT: adds r0
+; CHECK: blx
+  call void @take_ptr(i8* %addr8)
+  ret void
+}
+
+; Simple variable ending up at aligned offset from sp.
+define void @test_local_var_addr_aligned() {
+; CHECK-LABEL: test_local_var_addr_aligned:
+
+  %addr1.32 = alloca i32
+  %addr1 = bitcast i32* %addr1.32 to i8*
+  %addr2.32 = alloca i32
+  %addr2 = bitcast i32* %addr2.32 to i8*
+
+; CHECK: add r0, sp, #{{[0-9]+}}
+; CHECK: blx
+  call void @take_ptr(i8* %addr1)
+
+; CHECK: mov r0, sp
+; CHECK-NOT: add r0
+; CHECK: blx
+  call void @take_ptr(i8* %addr2)
+
+  ret void
+}
+
+; Simple variable ending up at aligned offset from sp.
+define void @test_local_var_big_offset() {
+; CHECK-LABEL: test_local_var_big_offset:
+  %addr1.32 = alloca i32, i32 257
+  %addr1 = bitcast i32* %addr1.32 to i8*
+  %addr2.32 = alloca i32, i32 257
+
+; CHECK: add [[RTMP:r[0-9]+]], sp, #1020
+; CHECK: adds [[RTMP]], #8
+; CHECK: blx
+  call void @take_ptr(i8* %addr1)
+
+  ret void
+}
+
+; Max range addressable with tADDrSPi
+define void @test_local_var_offset_1020() {
+; CHECK-LABEL: test_local_var_offset_1020
+  %addr1 = alloca i8, i32 4
+  %addr2 = alloca i8, i32 1020
+
+; CHECK: add r0, sp, #1020
+; CHECK-NEXT: blx
+  call void @take_ptr(i8* %addr1)
+
+  ret void
+}
+
+; Max range addressable with tADDrSPi + tADDi8
+define void @test_local_var_offset_1275() {
+; CHECK-LABEL: test_local_var_offset_1275
+  %addr1 = alloca i8, i32 1
+  %addr2 = alloca i8, i32 1275
+
+; CHECK: add r0, sp, #1020
+; CHECK: adds r0, #255
+; CHECK-NEXT: blx
+  call void @take_ptr(i8* %addr1)
+
+  ret void
+}
+
+declare void @take_ptr(i8*)
diff --git a/test/CodeGen/ARM/thumb1_return_sequence.ll b/test/CodeGen/ARM/thumb1_return_sequence.ll
new file mode 100644
index 0000000..318e6e4
--- /dev/null
+++ b/test/CodeGen/ARM/thumb1_return_sequence.ll
@@ -0,0 +1,217 @@
+; RUN: llc -mtriple=thumbv4t-none--eabi < %s | FileCheck %s --check-prefix=CHECK-V4T
+; RUN: llc -mtriple=thumbv5t-none--eabi < %s | FileCheck %s --check-prefix=CHECK-V5T
+
+; CHECK-V4T-LABEL: clobberframe
+; CHECK-V5T-LABEL: clobberframe
+define <4 x i32> @clobberframe() #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T:    push {[[SAVED:(r[4567](, )?)+]], lr}
+; CHECK-V4T:    sub sp,
+; CHECK-V5T:    push {[[SAVED:(r[4567](, )?)+]], lr}
+
+  %b = alloca <4 x i32>, align 16
+  %a = alloca <4 x i32>, align 16
+  store <4 x i32> <i32 42, i32 42, i32 42, i32 42>, <4 x i32>* %b, align 16
+  store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %a, align 16
+  %0 = load <4 x i32>* %a, align 16
+  ret <4 x i32> %0
+
+; Epilogue
+; --------
+; CHECK-V4T:         add sp,
+; CHECK-V4T-NEXT:    pop {[[SAVED]]}
+; CHECK-V4T-NEXT:    mov r12, r3
+; CHECK-V4T-NEXT:    pop {r3}
+; CHECK-V4T-NEXT:    mov lr, r3
+; CHECK-V4T-NEXT:    mov r3, r12
+; CHECK-V4T:         bx  lr
+; CHECK-V5T:         pop {[[SAVED]], pc}
+}
+
+; CHECK-V4T-LABEL: clobbervariadicframe
+; CHECK-V5T-LABEL: clobbervariadicframe
+define <4 x i32> @clobbervariadicframe(i32 %i, ...) #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T:    sub sp,
+; CHECK-V4T:    push {[[SAVED:(r[4567](, )?)+]], lr}
+; CHECK-V5T:    sub sp,
+; CHECK-V5T:    push {[[SAVED:(r[4567](, )?)+]], lr}
+
+  %b = alloca <4 x i32>, align 16
+  %a = alloca <4 x i32>, align 16
+  store <4 x i32> <i32 42, i32 42, i32 42, i32 42>, <4 x i32>* %b, align 16
+  store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %a, align 16
+  %0 = load <4 x i32>* %a, align 16
+  call void @llvm.va_start(i8* null)
+  ret <4 x i32> %0
+
+; Epilogue
+; --------
+; CHECK-V4T:         pop {[[SAVED]]}
+; CHECK-V4T-NEXT:    mov r12, r3
+; CHECK-V4T-NEXT:    pop {r3}
+; CHECK-V4T-NEXT:    add sp,
+; CHECK-V4T-NEXT:    mov lr, r3
+; CHECK-V4T-NEXT:    mov r3, r12
+; CHECK-V4T:         bx  lr
+; CHECK-V5T:         add sp,
+; CHECK-V5T-NEXT:    pop {[[SAVED]]}
+; CHECK-V5T-NEXT:    mov r12, r3
+; CHECK-V5T-NEXT:    pop {r3}
+; CHECK-V5T-NEXT:    add sp,
+; CHECK-V5T-NEXT:    mov lr, r3
+; CHECK-V5T-NEXT:    mov r3, r12
+; CHECK-V5T-NEXT:    bx lr
+}
+
+; CHECK-V4T-LABEL: simpleframe
+; CHECK-V5T-LABEL: simpleframe
+define i32 @simpleframe() #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T:    push    {[[SAVED:(r[4567](, )?)+]], lr}
+; CHECK-V5T:    push    {[[SAVED:(r[4567](, )?)+]], lr}
+
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %c = alloca i32, align 4
+  %d = alloca i32, align 4
+  store i32 1, i32* %a, align 4
+  store i32 2, i32* %b, align 4
+  store i32 3, i32* %c, align 4
+  store i32 4, i32* %d, align 4
+  %0 = load i32* %a, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %a, align 4
+  %1 = load i32* %b, align 4
+  %inc1 = add nsw i32 %1, 1
+  store i32 %inc1, i32* %b, align 4
+  %2 = load i32* %c, align 4
+  %inc2 = add nsw i32 %2, 1
+  store i32 %inc2, i32* %c, align 4
+  %3 = load i32* %d, align 4
+  %inc3 = add nsw i32 %3, 1
+  store i32 %inc3, i32* %d, align 4
+  %4 = load i32* %a, align 4
+  %5 = load i32* %b, align 4
+  %add = add nsw i32 %4, %5
+  %6 = load i32* %c, align 4
+  %add4 = add nsw i32 %add, %6
+  %7 = load i32* %d, align 4
+  %add5 = add nsw i32 %add4, %7
+  ret i32 %add5
+
+; Epilogue
+; --------
+; CHECK-V4T:    pop {[[SAVED]]}
+; CHECK-V4T:    pop {r3}
+; CHECK-V4T:    bx r3
+; CHECK-V5T:    pop {[[SAVED]], pc}
+}
+
+; CHECK-V4T-LABEL: simplevariadicframe
+; CHECK-V5T-LABEL: simplevariadicframe
+define i32 @simplevariadicframe(i32 %i, ...) #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T:    sub sp,
+; CHECK-V4T:    push {[[SAVED:(r[4567](, )?)+]], lr}
+; CHECK-V4T:    sub sp,
+; CHECK-V5T:    sub sp,
+; CHECK-V5T:    push {[[SAVED:(r[4567](, )?)+]], lr}
+; CHECK-V5T:    sub sp,
+
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %c = alloca i32, align 4
+  %d = alloca i32, align 4
+  store i32 1, i32* %a, align 4
+  store i32 2, i32* %b, align 4
+  store i32 3, i32* %c, align 4
+  store i32 4, i32* %d, align 4
+  %0 = load i32* %a, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %a, align 4
+  %1 = load i32* %b, align 4
+  %inc1 = add nsw i32 %1, 1
+  store i32 %inc1, i32* %b, align 4
+  %2 = load i32* %c, align 4
+  %inc2 = add nsw i32 %2, 1
+  store i32 %inc2, i32* %c, align 4
+  %3 = load i32* %d, align 4
+  %inc3 = add nsw i32 %3, 1
+  store i32 %inc3, i32* %d, align 4
+  %4 = load i32* %a, align 4
+  %5 = load i32* %b, align 4
+  %add = add nsw i32 %4, %5
+  %6 = load i32* %c, align 4
+  %add4 = add nsw i32 %add, %6
+  %7 = load i32* %d, align 4
+  %add5 = add nsw i32 %add4, %7
+  %add6 = add nsw i32 %add5, %i
+  call void @llvm.va_start(i8* null)
+  ret i32 %add6
+
+; Epilogue
+; --------
+; CHECK-V4T:         add sp,
+; CHECK-V4T-NEXT:    pop {[[SAVED]]}
+; CHECK-V4T-NEXT:    pop {r3}
+; CHECK-V4T-NEXT:    add sp,
+; CHECK-V4T-NEXT:    bx r3
+; CHECK-V5T:         add sp,
+; CHECK-V5T-NEXT:    pop {[[SAVED]]}
+; CHECK-V5T-NEXT:    pop {r3}
+; CHECK-V5T-NEXT:    add sp,
+; CHECK-V5T-NEXT:    bx r3
+}
+
+; CHECK-V4T-LABEL: noframe
+; CHECK-V5T-LABEL: noframe
+define i32 @noframe() #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T-NOT: push
+; CHECK-V5T-NOT: push
+    ret i32 0;
+; Epilogue
+; --------
+; CHECK-V4T-NOT: pop
+; CHECK-V5T-NOT: pop
+; CHECK-V4T:    bx  lr
+; CHECK-V5T:    bx  lr
+}
+
+; CHECK-V4T-LABEL: novariadicframe
+; CHECK-V5T-LABEL: novariadicframe
+define i32 @novariadicframe(i32 %i, ...) #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T:    sub sp,
+; CHECK-V4T:    push {[[SAVED:(r[4567](, )?)+]], lr}
+; CHECK-V5T:    sub sp,
+; CHECK-V5T:    push {[[SAVED:(r[4567](, )?)+]], lr}
+
+  call void @llvm.va_start(i8* null)
+  ret i32 %i;
+; Epilogue
+; --------
+; CHECK-V4T:         pop {[[SAVED]]}
+; CHECK-V4T-NEXT:    pop {r3}
+; CHECK-V4T-NEXT:    add sp,
+; CHECK-V4T-NEXT:    bx r3
+; CHECK-V5T:         pop {[[SAVED]]}
+; CHECK-V5T-NEXT:    pop {r3}
+; CHECK-V5T-NEXT:    add sp,
+; CHECK-V5T-NEXT:    bx r3
+}
+
+declare void @llvm.va_start(i8*) nounwind
diff --git a/test/CodeGen/ARM/thumb2-it-block.ll b/test/CodeGen/ARM/thumb2-it-block.ll
index c5e699c..2675a73 100644
--- a/test/CodeGen/ARM/thumb2-it-block.ll
+++ b/test/CodeGen/ARM/thumb2-it-block.ll
@@ -1,15 +1,9 @@
-; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
-; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck -check-prefix CHECK-V7 %s
+; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s -check-prefix CHECK-V8
 ; PR11107
 
 define i32 @test(i32 %a, i32 %b) {
 entry:
-; CHECK:        cmp
-; CHECK-NEXT:   it    mi
-; CHECK-NEXT:   rsb{{s?}}mi
-; CHECK-NEXT:   cmp
-; CHECK-NEXT:   it    mi
-; CHECK-NEXT:   rsb{{s?}}mi
  %cmp1 = icmp slt i32 %a, 0
  %sub1 = sub nsw i32 0, %a
  %abs1 = select i1 %cmp1, i32 %sub1, i32 %a
@@ -19,3 +13,18 @@ entry:
  %add = add nsw i32 %abs1, %abs2
  ret i32 %add
 }
+
+; CHECK-V7:        cmp
+; CHECK-V7-NEXT:   it    mi
+; CHECK-V7-NEXT:   rsbmi
+; CHECK-V7-NEXT:   cmp
+; CHECK-V7-NEXT:   it    mi
+; CHECK-V7-NEXT:   rsbmi
+
+; CHECK-V8:        cmp
+; CHECK-V8-NEXT:   bpl
+; CHECK-V8:        rsbs
+; CHECK-V8:        cmp
+; CHECK-V8-NEXT:   bpl
+; CHECK-V8:        rsbs
+
diff --git a/test/CodeGen/ARM/thumb2-size-opt.ll b/test/CodeGen/ARM/thumb2-size-opt.ll
new file mode 100644
index 0000000..0084a45
--- /dev/null
+++ b/test/CodeGen/ARM/thumb2-size-opt.ll
@@ -0,0 +1,84 @@
+; RUN: llc -mtriple=thumbv7-linux-gnueabihf -o - -show-mc-encoding -t2-reduce-limit=0 -t2-reduce-limit2=0 %s | FileCheck %s
+; RUN: llc -mtriple=thumbv7-linux-gnueabihf -o - -show-mc-encoding %s | FileCheck %s --check-prefix=CHECK-OPT
+
+define i32 @and(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: and:
+; CHECK: and.w r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: ands r{{[0-7]}}, r{{[0-7]}} @ encoding: [{{0x..,0x..}}]
+entry:
+  %and = and i32 %b, %a
+  ret i32 %and
+}
+
+define i32 @asr-imm(i32 %a) nounwind readnone {
+; CHECK-LABEL: "asr-imm":
+; CHECK: asr.w r{{[0-9]+}}, r{{[0-9]+}}, #13 @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: asrs r{{[0-7]}}, r{{[0-7]}}, #13 @ encoding: [{{0x..,0x..}}]
+entry:
+  %shr = ashr i32 %a, 13
+  ret i32 %shr
+}
+
+define i32 @asr-reg(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: "asr-reg":
+; CHECK: asr.w r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: asrs r{{[0-7]}}, r{{[0-7]}} @ encoding: [{{0x..,0x..}}]
+entry:
+  %shr = ashr i32 %a, %b
+  ret i32 %shr
+}
+
+define i32 @bic(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: bic:
+; CHECK: bic.w r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: bics r{{[0-7]}}, r{{[0-7]}} @ encoding: [{{0x..,0x..}}]
+entry:
+  %neg = xor i32 %b, -1
+  %and = and i32 %neg, %a
+  ret i32 %and
+}
+
+define i32 @eor(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: eor:
+; CHECK: eor.w r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: eors r{{[0-7]}}, r{{[0-7]}} @ encoding: [{{0x..,0x..}}]
+entry:
+  %eor = xor i32 %a, %b
+  ret i32 %eor
+}
+
+define i32 @lsl-imm(i32 %a) nounwind readnone {
+; CHECK-LABEL: "lsl-imm":
+; CHECK: lsl.w r{{[0-9]+}}, r{{[0-9]+}}, #13 @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: lsls r{{[0-7]}}, r{{[0-7]}}, #13  @ encoding: [{{0x..,0x..}}]
+entry:
+  %shl = shl i32 %a, 13
+  ret i32 %shl
+}
+
+define i32 @lsl-reg(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: "lsl-reg":
+; CHECK: lsl.w r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: lsls r{{[0-7]}}, r{{[0-7]}}  @ encoding: [{{0x..,0x..}}]
+entry:
+  %shl = shl i32 %a, %b
+  ret i32 %shl
+}
+
+define i32 @lsr-imm(i32 %a) nounwind readnone {
+; CHECK-LABEL: "lsr-imm":
+; CHECK: lsr.w r{{[0-9]+}}, r{{[0-9]+}}, #13 @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: lsrs r{{[0-7]}}, r{{[0-7]}}, #13  @ encoding: [{{0x..,0x..}}]
+entry:
+  %shr = lshr i32 %a, 13
+  ret i32 %shr
+}
+
+define i32 @lsr-reg(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: "lsr-reg":
+; CHECK: lsr.w r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: lsrs r{{[0-7]}}, r{{[0-7]}}  @ encoding: [{{0x..,0x..}}]
+entry:
+  %shr = lshr i32 %a, %b
+  ret i32 %shr
+}
diff --git a/test/CodeGen/ARM/vararg_no_start.ll b/test/CodeGen/ARM/vararg_no_start.ll
new file mode 100644
index 0000000..f9c8c1b
--- /dev/null
+++ b/test/CodeGen/ARM/vararg_no_start.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=arm-darwin < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=arm-darwin < %s | FileCheck %s
+
+define void @foo(i8*, ...) {
+  ret void
+}
+; CHECK-LABEL: {{^_?}}foo:
+; CHECK-NOT: str
+; CHECK: {{bx lr|mov pc, lr}}
+declare void @llvm.va_start(i8*) nounwind
diff --git a/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll b/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll
index 19d6cbe..148a79d 100644
--- a/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll
+++ b/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll
@@ -22,9 +22,9 @@ define void @varargs_func(i32 %arg1, ...) {
 ; Reserve space for the varargs save area.  This currently reserves
 ; more than enough (16 bytes rather than the 12 bytes needed).
 ; CHECK: sub sp, sp, #16
-; CHECK: push {lr}
+; CHECK: push {r11, lr}
 ; Align the stack pointer to a multiple of 16.
-; CHECK: sub sp, sp, #12
+; CHECK: sub sp, sp, #8
 ; Calculate the address of the varargs save area and save varargs
 ; arguments into it.
 ; CHECK-NEXT: add r0, sp, #20
diff --git a/test/CodeGen/ARM/vargs_align.ll b/test/CodeGen/ARM/vargs_align.ll
index e390cf0..3abb57e 100644
--- a/test/CodeGen/ARM/vargs_align.ll
+++ b/test/CodeGen/ARM/vargs_align.ll
@@ -10,6 +10,7 @@ entry:
 	store i32 0, i32* %tmp
 	%tmp1 = load i32* %tmp		; <i32> [#uses=1]
 	store i32 %tmp1, i32* %retval
+	call void @llvm.va_start(i8* null)
 	br label %return
 
 return:		; preds = %entry
@@ -20,3 +21,5 @@ return:		; preds = %entry
 ; OABI: add sp, sp, #12
 ; OABI: add sp, sp, #12
 }
+
+declare void @llvm.va_start(i8*) nounwind
diff --git a/test/CodeGen/ARM/vector-promotion.ll b/test/CodeGen/ARM/vector-promotion.ll
new file mode 100644
index 0000000..42ceb60
--- /dev/null
+++ b/test/CodeGen/ARM/vector-promotion.ll
@@ -0,0 +1,403 @@
+; RUN: opt -codegenprepare -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s
+; RUN: opt -codegenprepare -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s
+; RUN: llc -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon | FileCheck --check-prefix=ASM %s
+
+; IR-BOTH-LABEL: @simpleOneInstructionPromotion
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 undef, i32 1>
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR]], i32 1
+; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest
+; IR-BOTH-NEXT: ret
+;
+; Make sure we got rid of any expensive vmov.32 instructions.
+; ASM-LABEL: simpleOneInstructionPromotion:
+; ASM: vldr [[LOAD:d[0-9]+]], [r0]
+; ASM-NEXT: vorr.i32 [[LOAD]], #0x1
+; ASM-NEXT: vst1.32 {[[LOAD]][1]}, [r1:32]
+; ASM-NEXT: bx
+define void @simpleOneInstructionPromotion(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @unsupportedInstructionForPromotion
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0
+; IR-BOTH-NEXT: [[CMP:%[a-zA-Z_0-9-]+]] = icmp eq i32 [[EXTRACT]], %in2
+; IR-BOTH-NEXT: store i1 [[CMP]], i1* %dest
+; IR-BOTH-NEXT: ret
+;
+; ASM-LABEL: unsupportedInstructionForPromotion:
+; ASM: vldr [[LOAD:d[0-9]+]], [r0]
+; ASM: vmov.32 {{r[0-9]+}}, [[LOAD]]
+; ASM: bx
+define void @unsupportedInstructionForPromotion(<2 x i32>* %addr1, i32 %in2, i1* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  %out = icmp eq i32 %extract, %in2
+  store i1 %out, i1* %dest, align 4
+  ret void
+}
+
+
+; IR-BOTH-LABEL: @unsupportedChainInDifferentBBs
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0
+; IR-BOTH-NEXT: br i1 %bool, label %bb2, label %end
+; BB2
+; IR-BOTH: [[OR:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
+; IR-BOTH-NEXT: store i32 [[OR]], i32* %dest, align 4
+; IR-BOTH: ret
+;
+; ASM-LABEL: unsupportedChainInDifferentBBs:
+; ASM: vldrne [[LOAD:d[0-9]+]], [r0]
+; ASM: vmovne.32 {{r[0-9]+}}, [[LOAD]]
+; ASM: bx
+define void @unsupportedChainInDifferentBBs(<2 x i32>* %addr1, i32* %dest, i1 %bool) {
+bb1:
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  br i1 %bool, label %bb2, label %end
+bb2: 
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 4
+  br label %end
+end:
+  ret void
+}
+
+; IR-LABEL: @chainOfInstructionsToPromote
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-BOTH-NEXT: [[VECTOR_OR1:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 1, i32 undef>
+; IR-BOTH-NEXT: [[VECTOR_OR2:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR1]], <i32 1, i32 undef>
+; IR-BOTH-NEXT: [[VECTOR_OR3:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR2]], <i32 1, i32 undef>
+; IR-BOTH-NEXT: [[VECTOR_OR4:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR3]], <i32 1, i32 undef>
+; IR-BOTH-NEXT: [[VECTOR_OR5:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR4]], <i32 1, i32 undef>
+; IR-BOTH-NEXT: [[VECTOR_OR6:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR5]], <i32 1, i32 undef>
+; IR-BOTH-NEXT: [[VECTOR_OR7:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR6]], <i32 1, i32 undef>
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR7]], i32 0
+; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest
+; IR-BOTH-NEXT: ret
+;
+; ASM-LABEL: chainOfInstructionsToPromote:
+; ASM: vldr [[LOAD:d[0-9]+]], [r0]
+; ASM-NOT: vmov.32 {{r[0-9]+}}, [[LOAD]]
+; ASM: bx
+define void @chainOfInstructionsToPromote(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  %out1 = or i32 %extract, 1
+  %out2 = or i32 %out1, 1
+  %out3 = or i32 %out2, 1
+  %out4 = or i32 %out3, 1
+  %out5 = or i32 %out4, 1
+  %out6 = or i32 %out5, 1
+  %out7 = or i32 %out6, 1
+  store i32 %out7, i32* %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @unsupportedMultiUses
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; IR-BOTH-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
+; IR-BOTH-NEXT: store i32 [[OR]], i32* %dest
+; IR-BOTH-NEXT: ret i32 [[OR]]
+;
+; ASM-LABEL: unsupportedMultiUses:
+; ASM: vldr [[LOAD:d[0-9]+]], [r0]
+; ASM: vmov.32 {{r[0-9]+}}, [[LOAD]]
+; ASM: bx
+define i32 @unsupportedMultiUses(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 4
+  ret i32 %out
+}
+
+; Check that we promote we a splat constant when this is a division.
+; The NORMAL mode does not promote anything as divisions are not legal.
+; IR-BOTH-LABEL: @udivCase
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; Scalar version:
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = udiv i32 [[EXTRACT]], 7
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = udiv <2 x i32> [[LOAD]], <i32 7, i32 7>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
+; IR-BOTH-NEXT: ret
+define void @udivCase(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = udiv i32 %extract, 7
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @uremCase
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; Scalar version:
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = urem i32 [[EXTRACT]], 7
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = urem <2 x i32> [[LOAD]], <i32 7, i32 7>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
+; IR-BOTH-NEXT: ret 
+define void @uremCase(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = urem i32 %extract, 7
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @sdivCase
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; Scalar version:
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = sdiv i32 [[EXTRACT]], 7
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = sdiv <2 x i32> [[LOAD]], <i32 7, i32 7>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
+; IR-BOTH-NEXT: ret 
+define void @sdivCase(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = sdiv i32 %extract, 7
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @sremCase
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; Scalar version:
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = srem i32 [[EXTRACT]], 7
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = srem <2 x i32> [[LOAD]], <i32 7, i32 7>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
+; IR-BOTH-NEXT: ret 
+define void @sremCase(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = srem i32 %extract, 7
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @fdivCase
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1
+; Scalar version:  
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fdiv float [[EXTRACT]], 7.0
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fdiv <2 x float> [[LOAD]], <float 7.000000e+00, float 7.000000e+00>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store float [[RES]], float* %dest
+; IR-BOTH-NEXT: ret
+define void @fdivCase(<2 x float>* %addr1, float* %dest) {
+  %in1 = load <2 x float>* %addr1, align 8   
+  %extract = extractelement <2 x float> %in1, i32 1
+  %out = fdiv float %extract, 7.0
+  store float %out, float* %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @fremCase
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1
+; Scalar version:  
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem float [[EXTRACT]], 7.0
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem <2 x float> [[LOAD]], <float 7.000000e+00, float 7.000000e+00>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store float [[RES]], float* %dest
+; IR-BOTH-NEXT: ret
+define void @fremCase(<2 x float>* %addr1, float* %dest) {
+  %in1 = load <2 x float>* %addr1, align 8   
+  %extract = extractelement <2 x float> %in1, i32 1
+  %out = frem float %extract, 7.0
+  store float %out, float* %dest, align 4
+  ret void
+}
+
+; Check that we do not promote when we may introduce undefined behavior
+; like division by zero.
+; IR-BOTH-LABEL: @undefDivCase
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; IR-BOTH-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = udiv i32 7, [[EXTRACT]]
+; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
+; IR-BOTH-NEXT: ret
+define void @undefDivCase(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = udiv i32 7, %extract
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+
+; Check that we do not promote when we may introduce undefined behavior
+; like division by zero.
+; IR-BOTH-LABEL: @undefRemCase
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; IR-BOTH-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = srem i32 7, [[EXTRACT]]
+; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
+; IR-BOTH-NEXT: ret
+define void @undefRemCase(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = srem i32 7, %extract
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; Check that we use an undef mask for undefined behavior if the fast-math
+; flag is set.
+; IR-BOTH-LABEL: @undefConstantFRemCaseWithFastMath
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1
+; Scalar version:  
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float [[EXTRACT]], 7.0
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem nnan <2 x float> [[LOAD]], <float undef, float 7.000000e+00>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store float [[RES]], float* %dest
+; IR-BOTH-NEXT: ret
+define void @undefConstantFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) {
+  %in1 = load <2 x float>* %addr1, align 8   
+  %extract = extractelement <2 x float> %in1, i32 1
+  %out = frem nnan float %extract, 7.0
+  store float %out, float* %dest, align 4
+  ret void
+}
+
+; Check that we use an undef mask for undefined behavior if the fast-math
+; flag is set.
+; IR-BOTH-LABEL: @undefVectorFRemCaseWithFastMath
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1
+; Scalar version:  
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float 7.000000e+00, [[EXTRACT]]
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem nnan <2 x float> <float undef, float 7.000000e+00>, [[LOAD]]
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store float [[RES]], float* %dest
+; IR-BOTH-NEXT: ret
+define void @undefVectorFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) {
+  %in1 = load <2 x float>* %addr1, align 8   
+  %extract = extractelement <2 x float> %in1, i32 1
+  %out = frem nnan float 7.0, %extract
+  store float %out, float* %dest, align 4
+  ret void
+}
+
+; Check that we are able to promote floating point value.
+; This requires the STRESS mode, as floating point value are
+; not promote on armv7.
+; IR-BOTH-LABEL: @simpleOneInstructionPromotionFloat
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1
+; Scalar version: 
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fadd float [[EXTRACT]], 1.0
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fadd <2 x float> [[LOAD]], <float undef, float 1.000000e+00>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store float [[RES]], float* %dest
+; IR-BOTH-NEXT: ret
+define void @simpleOneInstructionPromotionFloat(<2 x float>* %addr1, float* %dest) {
+  %in1 = load <2 x float>* %addr1, align 8
+  %extract = extractelement <2 x float> %in1, i32 1
+  %out = fadd float %extract, 1.0
+  store float %out, float* %dest, align 4
+  ret void
+}
+
+; Check that we correctly use a splat constant when we cannot
+; determine at compile time the index of the extract.
+; This requires the STRESS modes, as variable index are expensive
+; to lower.
+; IR-BOTH-LABEL: @simpleOneInstructionPromotionVariableIdx
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; Scalar version:
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 %idx
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
+; Vector version:
+; IR-STRESS-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 1, i32 1>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[OR]], i32 %idx
+;
+; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
+; IR-BOTH-NEXT: ret
+define void @simpleOneInstructionPromotionVariableIdx(<2 x i32>* %addr1, i32* %dest, i32 %idx) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 %idx
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; Check a vector with more than 2 elements.
+; This requires the STRESS mode because currently 'or v8i8' is not marked
+; as legal or custom, althought the actual assembly is better if we were
+; promoting it.
+; IR-BOTH-LABEL: @simpleOneInstructionPromotion8x8
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <8 x i8>* %addr1
+; Scalar version:  
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <8 x i8> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i8 [[EXTRACT]], 1
+; Vector version:  
+; IR-STRESS-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <8 x i8> [[LOAD]], <i8 undef, i8 1, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <8 x i8> [[OR]], i32 1
+;
+; IR-BOTH-NEXT: store i8 [[RES]], i8* %dest
+; IR-BOTH-NEXT: ret
+define void @simpleOneInstructionPromotion8x8(<8 x i8>* %addr1, i8* %dest) {
+  %in1 = load <8 x i8>* %addr1, align 8
+  %extract = extractelement <8 x i8> %in1, i32 1
+  %out = or i8 %extract, 1
+  store i8 %out, i8* %dest, align 4
+  ret void
+}
+
+; Check that we optimized the sequence correctly when it can be
+; lowered on a Q register.
+; IR-BOTH-LABEL: @simpleOneInstructionPromotion
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>* %addr1
+; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <4 x i32> [[LOAD]], <i32 undef, i32 1, i32 undef, i32 undef>
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[VECTOR_OR]], i32 1
+; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest
+; IR-BOTH-NEXT: ret
+;
+; Make sure we got rid of any expensive vmov.32 instructions.
+; ASM-LABEL: simpleOneInstructionPromotion4x32:
+; ASM: vld1.64 {[[LOAD:d[0-9]+]], d{{[0-9]+}}}, [r0]
+; The Q register used here must be [[LOAD]] / 2, but we cannot express that.
+; ASM-NEXT: vorr.i32 q{{[[0-9]+}}, #0x1
+; ASM-NEXT: vst1.32 {[[LOAD]][1]}, [r1]
+; ASM-NEXT: bx
+define void @simpleOneInstructionPromotion4x32(<4 x i32>* %addr1, i32* %dest) {
+  %in1 = load <4 x i32>* %addr1, align 8
+  %extract = extractelement <4 x i32> %in1, i32 1
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 1
+  ret void
+}
diff --git a/test/CodeGen/ARM/vector-spilling.ll b/test/CodeGen/ARM/vector-spilling.ll
new file mode 100644
index 0000000..746c6df
--- /dev/null
+++ b/test/CodeGen/ARM/vector-spilling.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=arm -mtriple=armv7-linux-gnueabihf -arm-atomic-cfg-tidy=0 -float-abi=hard -mcpu=cortex-a9 -O3 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64"
+
+; This test will generate spills/fills using vldmia instructions that access 24 bytes of memory.
+; Check that we don't crash when we generate these instructions on Cortex-A9.
+
+; CHECK: test:
+; CHECK: vstmia
+; CHECK: vldmia
+define void @test(<8 x i64>* %src) #0 {
+entry:
+  %0 = getelementptr inbounds <8 x i64>* %src, i32 0
+  %1 = load <8 x i64>* %0, align 8
+
+  %2 = getelementptr inbounds <8 x i64>* %src, i32 1
+  %3 = load <8 x i64>* %2, align 8
+
+  %4 = getelementptr inbounds <8 x i64>* %src, i32 2
+  %5 = load <8 x i64>* %4, align 8
+
+  %6 = getelementptr inbounds <8 x i64>* %src, i32 3
+  %7 = load <8 x i64>* %6, align 8
+
+  %8 = shufflevector <8 x i64> %1, <8 x i64> %3, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %9 = shufflevector <8 x i64> %1, <8 x i64> %3, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+
+  tail call void(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>)* @foo(<8 x i64> %1, <8 x i64> %3, <8 x i64> %5, <8 x i64> %7, <8 x i64> %8, <8 x i64> %9)
+  ret void
+}
+
+declare void @foo(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>)
+
+attributes #0 = { noredzone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/ARM/vfp-regs-dwarf.ll b/test/CodeGen/ARM/vfp-regs-dwarf.ll
index 4976729..f83adf9 100644
--- a/test/CodeGen/ARM/vfp-regs-dwarf.ll
+++ b/test/CodeGen/ARM/vfp-regs-dwarf.ll
@@ -31,14 +31,14 @@ define void @stack_offsets() {
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8, !9}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/Users/tim/llvm/build/tmp.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/tim/llvm/build/tmp.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"tmp.c", metadata !"/Users/tim/llvm/build"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"bar", metadata !"bar", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @stack_offsets, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/Users/tim/llvm/build/tmp.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00bar\00bar\00\001\000\001\000\006\000\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @stack_offsets, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/Users/tim/llvm/build/tmp.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
 !8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 
diff --git a/test/CodeGen/ARM/vldm-sched-a9.ll b/test/CodeGen/ARM/vldm-sched-a9.ll
index f2e5eb9..e5e7bc0 100644
--- a/test/CodeGen/ARM/vldm-sched-a9.ll
+++ b/test/CodeGen/ARM/vldm-sched-a9.ll
@@ -2,8 +2,8 @@
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64"
 
-; This test will generate spills/fills using vldmia instructions that access 64 bytes of memory.
-; Check that we don't crash when we generate these instructions on Cortex-A9.
+; This test used to test vector spilling using vstmia/vldmia instructions, but
+; the changes for PR:18825 prevent that spilling.
 
 ; CHECK: test:
 ; CHECK: vstmia
diff --git a/test/CodeGen/ARM/vminmaxnm.ll b/test/CodeGen/ARM/vminmaxnm.ll
index f6ce64c..39289a0 100644
--- a/test/CodeGen/ARM/vminmaxnm.ll
+++ b/test/CodeGen/ARM/vminmaxnm.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -mtriple armv8 -mattr=+neon,+fp-armv8 -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-FAST
 
 define <4 x float> @vmaxnmq(<4 x float>* %A, <4 x float>* %B) nounwind {
-; CHECK: vmaxnmq
+; CHECK-LABEL: vmaxnmq:
 ; CHECK: vmaxnm.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
   %tmp1 = load <4 x float>* %A
   %tmp2 = load <4 x float>* %B
@@ -11,7 +11,7 @@ define <4 x float> @vmaxnmq(<4 x float>* %A, <4 x float>* %B) nounwind {
 }
 
 define <2 x float> @vmaxnmd(<2 x float>* %A, <2 x float>* %B) nounwind {
-; CHECK: vmaxnmd
+; CHECK-LABEL: vmaxnmd:
 ; CHECK: vmaxnm.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
   %tmp1 = load <2 x float>* %A
   %tmp2 = load <2 x float>* %B
@@ -20,7 +20,7 @@ define <2 x float> @vmaxnmd(<2 x float>* %A, <2 x float>* %B) nounwind {
 }
 
 define <4 x float> @vminnmq(<4 x float>* %A, <4 x float>* %B) nounwind {
-; CHECK: vminnmq
+; CHECK-LABEL: vminnmq:
 ; CHECK: vminnm.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
   %tmp1 = load <4 x float>* %A
   %tmp2 = load <4 x float>* %B
@@ -29,7 +29,7 @@ define <4 x float> @vminnmq(<4 x float>* %A, <4 x float>* %B) nounwind {
 }
 
 define <2 x float> @vminnmd(<2 x float>* %A, <2 x float>* %B) nounwind {
-; CHECK: vminnmd
+; CHECK-LABEL: vminnmd:
 ; CHECK: vminnm.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
   %tmp1 = load <2 x float>* %A
   %tmp2 = load <2 x float>* %B
@@ -38,49 +38,93 @@ define <2 x float> @vminnmd(<2 x float>* %A, <2 x float>* %B) nounwind {
 }
 
 define float @fp-armv8_vminnm_o(float %a, float %b) {
-; CHECK-FAST: fp-armv8_vminnm_o
+; CHECK-FAST-LABEL: "fp-armv8_vminnm_o":
 ; CHECK-FAST-NOT: vcmp
 ; CHECK-FAST: vminnm.f32
-; CHECK: fp-armv8_vminnm_o
+; CHECK-LABEL: "fp-armv8_vminnm_o":
 ; CHECK-NOT: vminnm.f32
   %cmp = fcmp olt float %a, %b
   %cond = select i1 %cmp, float %a, float %b
   ret float %cond
 }
 
+define float @fp-armv8_vminnm_o_rev(float %a, float %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vminnm_o_rev":
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vminnm.f32
+; CHECK-LABEL: "fp-armv8_vminnm_o_rev":
+; CHECK-NOT: vminnm.f32
+  %cmp = fcmp ogt float %a, %b
+  %cond = select i1 %cmp, float %b, float %a
+  ret float %cond
+}
+
 define float @fp-armv8_vminnm_u(float %a, float %b) {
-; CHECK-FAST: fp-armv8_vminnm_u
+; CHECK-FAST-LABEL: "fp-armv8_vminnm_u":
 ; CHECK-FAST-NOT: vcmp
 ; CHECK-FAST: vminnm.f32
-; CHECK: fp-armv8_vminnm_u
+; CHECK-LABEL: "fp-armv8_vminnm_u":
 ; CHECK-NOT: vminnm.f32
   %cmp = fcmp ult float %a, %b
   %cond = select i1 %cmp, float %a, float %b
   ret float %cond
 }
 
+define float @fp-armv8_vminnm_u_rev(float %a, float %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vminnm_u_rev":
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vminnm.f32
+; CHECK-LABEL: "fp-armv8_vminnm_u_rev":
+; CHECK-NOT: vminnm.f32
+  %cmp = fcmp ugt float %a, %b
+  %cond = select i1 %cmp, float %b, float %a
+  ret float %cond
+}
+
 define float @fp-armv8_vmaxnm_o(float %a, float %b) {
-; CHECK-FAST: fp-armv8_vmaxnm_o
+; CHECK-FAST-LABEL: "fp-armv8_vmaxnm_o":
 ; CHECK-FAST-NOT: vcmp
 ; CHECK-FAST: vmaxnm.f32
-; CHECK: fp-armv8_vmaxnm_o
+; CHECK-LABEL: "fp-armv8_vmaxnm_o":
 ; CHECK-NOT: vmaxnm.f32
   %cmp = fcmp ogt float %a, %b
   %cond = select i1 %cmp, float %a, float %b
   ret float %cond
 }
 
+define float @fp-armv8_vmaxnm_o_rev(float %a, float %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vmaxnm_o_rev":
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vmaxnm.f32
+; CHECK-LABEL: "fp-armv8_vmaxnm_o_rev":
+; CHECK-NOT: vmaxnm.f32
+  %cmp = fcmp olt float %a, %b
+  %cond = select i1 %cmp, float %b, float %a
+  ret float %cond
+}
+
 define float @fp-armv8_vmaxnm_u(float %a, float %b) {
-; CHECK-FAST: fp-armv8_vmaxnm_u
+; CHECK-FAST-LABEL: "fp-armv8_vmaxnm_u":
 ; CHECK-FAST-NOT: vcmp
 ; CHECK-FAST: vmaxnm.f32
-; CHECK: fp-armv8_vmaxnm_u
+; CHECK-LABEL: "fp-armv8_vmaxnm_u":
 ; CHECK-NOT: vmaxnm.f32
   %cmp = fcmp ugt float %a, %b
   %cond = select i1 %cmp, float %a, float %b
   ret float %cond
 }
 
+define float @fp-armv8_vmaxnm_u_rev(float %a, float %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vmaxnm_u_rev":
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vmaxnm.f32
+; CHECK-LABEL: "fp-armv8_vmaxnm_u_rev":
+; CHECK-NOT: vmaxnm.f32
+  %cmp = fcmp ult float %a, %b
+  %cond = select i1 %cmp, float %b, float %a
+  ret float %cond
+}
+
 
 declare <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
 declare <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll b/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll
new file mode 100644
index 0000000..7ecd252
--- /dev/null
+++ b/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mcpu=cortex-a9 -O1 -filetype=obj %s -o - | llvm-objdump -arch thumb -mcpu=cortex-a9 -d - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7--linux-gnueabi"
+
+declare i8* @llvm.returnaddress(i32)
+
+define i32* @wrong-t2stmia-size-reduction(i32* %addr, i32 %val0) minsize {
+  store i32 %val0, i32* %addr
+  %addr1 = getelementptr i32* %addr, i32 1
+  %lr = call i8* @llvm.returnaddress(i32 0)
+  %lr32 = ptrtoint i8* %lr to i32
+  store i32 %lr32, i32* %addr1
+  %addr2 = getelementptr i32* %addr1, i32 1
+  ret i32* %addr2
+}
+
+; Check that stm writes two registers.  The bug caused one of registers (LR,
+; which invalid for Thumb1 form of STMIA instruction) to be dropped.
+; CHECK: stm{{[^,]*}}, {{{.*,.*}}}
diff --git a/test/CodeGen/Generic/2009-03-17-LSR-APInt.ll b/test/CodeGen/Generic/2009-03-17-LSR-APInt.ll
index 3f17ce1..eaaeb37 100644
--- a/test/CodeGen/Generic/2009-03-17-LSR-APInt.ll
+++ b/test/CodeGen/Generic/2009-03-17-LSR-APInt.ll
@@ -30,20 +30,20 @@
 	%"struct.qdesigner_internal::GridLayout" = type { %"struct.qdesigner_internal::Layout", %"struct.QPair<int,int>", %"struct.qdesigner_internal::Grid"* }
 	%"struct.qdesigner_internal::Layout" = type { %struct.QObject, %"struct.QList<QAbstractExtensionFactory*>", %struct.QWidget*, %"struct.QHash<QString,QList<QAbstractExtensionFactory*> >", %struct.QWidget*, %struct.QDesignerFormWindowInterface*, i8, %"struct.QPair<int,int>", %struct.QRect, i8 }
 
-@_ZL20__gthrw_pthread_oncePiPFvvE = alias weak i32 (i32*, void ()*)* @pthread_once		; <i32 (i32*, void ()*)*> [#uses=0]
-@_ZL27__gthrw_pthread_getspecificj = alias weak i8* (i32)* @pthread_getspecific		; <i8* (i32)*> [#uses=0]
-@_ZL27__gthrw_pthread_setspecificjPKv = alias weak i32 (i32, i8*)* @pthread_setspecific		; <i32 (i32, i8*)*> [#uses=0]
-@_ZL22__gthrw_pthread_createPmPK14pthread_attr_tPFPvS3_ES3_ = alias weak i32 (i64*, %struct.pthread_attr_t*, i8* (i8*)*, i8*)* @pthread_create		; <i32 (i64*, %struct.pthread_attr_t*, i8* (i8*)*, i8*)*> [#uses=0]
-@_ZL22__gthrw_pthread_cancelm = alias weak i32 (i64)* @pthread_cancel		; <i32 (i64)*> [#uses=0]
-@_ZL26__gthrw_pthread_mutex_lockP15pthread_mutex_t = alias weak i32 (%struct.pthread_mutex_t*)* @pthread_mutex_lock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
-@_ZL29__gthrw_pthread_mutex_trylockP15pthread_mutex_t = alias weak i32 (%struct.pthread_mutex_t*)* @pthread_mutex_trylock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
-@_ZL28__gthrw_pthread_mutex_unlockP15pthread_mutex_t = alias weak i32 (%struct.pthread_mutex_t*)* @pthread_mutex_unlock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
-@_ZL26__gthrw_pthread_mutex_initP15pthread_mutex_tPK19pthread_mutexattr_t = alias weak i32 (%struct.pthread_mutex_t*, %struct.Alignment*)* @pthread_mutex_init		; <i32 (%struct.pthread_mutex_t*, %struct.Alignment*)*> [#uses=0]
-@_ZL26__gthrw_pthread_key_createPjPFvPvE = alias weak i32 (i32*, void (i8*)*)* @pthread_key_create		; <i32 (i32*, void (i8*)*)*> [#uses=0]
-@_ZL26__gthrw_pthread_key_deletej = alias weak i32 (i32)* @pthread_key_delete		; <i32 (i32)*> [#uses=0]
-@_ZL30__gthrw_pthread_mutexattr_initP19pthread_mutexattr_t = alias weak i32 (%struct.Alignment*)* @pthread_mutexattr_init		; <i32 (%struct.Alignment*)*> [#uses=0]
-@_ZL33__gthrw_pthread_mutexattr_settypeP19pthread_mutexattr_ti = alias weak i32 (%struct.Alignment*, i32)* @pthread_mutexattr_settype		; <i32 (%struct.Alignment*, i32)*> [#uses=0]
-@_ZL33__gthrw_pthread_mutexattr_destroyP19pthread_mutexattr_t = alias weak i32 (%struct.Alignment*)* @pthread_mutexattr_destroy		; <i32 (%struct.Alignment*)*> [#uses=0]
+@_ZL20__gthrw_pthread_oncePiPFvvE = weak alias i32 (i32*, void ()*)* @pthread_once		; <i32 (i32*, void ()*)*> [#uses=0]
+@_ZL27__gthrw_pthread_getspecificj = weak alias i8* (i32)* @pthread_getspecific		; <i8* (i32)*> [#uses=0]
+@_ZL27__gthrw_pthread_setspecificjPKv = weak alias i32 (i32, i8*)* @pthread_setspecific		; <i32 (i32, i8*)*> [#uses=0]
+@_ZL22__gthrw_pthread_createPmPK14pthread_attr_tPFPvS3_ES3_ = weak alias i32 (i64*, %struct.pthread_attr_t*, i8* (i8*)*, i8*)* @pthread_create		; <i32 (i64*, %struct.pthread_attr_t*, i8* (i8*)*, i8*)*> [#uses=0]
+@_ZL22__gthrw_pthread_cancelm = weak alias i32 (i64)* @pthread_cancel		; <i32 (i64)*> [#uses=0]
+@_ZL26__gthrw_pthread_mutex_lockP15pthread_mutex_t = weak alias i32 (%struct.pthread_mutex_t*)* @pthread_mutex_lock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
+@_ZL29__gthrw_pthread_mutex_trylockP15pthread_mutex_t = weak alias i32 (%struct.pthread_mutex_t*)* @pthread_mutex_trylock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
+@_ZL28__gthrw_pthread_mutex_unlockP15pthread_mutex_t = weak alias i32 (%struct.pthread_mutex_t*)* @pthread_mutex_unlock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
+@_ZL26__gthrw_pthread_mutex_initP15pthread_mutex_tPK19pthread_mutexattr_t = weak alias i32 (%struct.pthread_mutex_t*, %struct.Alignment*)* @pthread_mutex_init		; <i32 (%struct.pthread_mutex_t*, %struct.Alignment*)*> [#uses=0]
+@_ZL26__gthrw_pthread_key_createPjPFvPvE = weak alias i32 (i32*, void (i8*)*)* @pthread_key_create		; <i32 (i32*, void (i8*)*)*> [#uses=0]
+@_ZL26__gthrw_pthread_key_deletej = weak alias i32 (i32)* @pthread_key_delete		; <i32 (i32)*> [#uses=0]
+@_ZL30__gthrw_pthread_mutexattr_initP19pthread_mutexattr_t = weak alias i32 (%struct.Alignment*)* @pthread_mutexattr_init		; <i32 (%struct.Alignment*)*> [#uses=0]
+@_ZL33__gthrw_pthread_mutexattr_settypeP19pthread_mutexattr_ti = weak alias i32 (%struct.Alignment*, i32)* @pthread_mutexattr_settype		; <i32 (%struct.Alignment*, i32)*> [#uses=0]
+@_ZL33__gthrw_pthread_mutexattr_destroyP19pthread_mutexattr_t = weak alias i32 (%struct.Alignment*)* @pthread_mutexattr_destroy		; <i32 (%struct.Alignment*)*> [#uses=0]
 
 define void @_ZN18qdesigner_internal10GridLayout9buildGridEv(%"struct.qdesigner_internal::GridLayout"* %this) nounwind {
 entry:
diff --git a/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll b/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll
index da26504..cd446d5 100644
--- a/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll
+++ b/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll
@@ -3,9 +3,6 @@
 ; the uses of a copy to a physical register without ignoring non-data
 ; dependence, PR10220.
 
-; The ARM backend can't handle i256 math at the moment.
-; XFAIL: arm
-
 define void @f(i256* nocapture %a, i256* nocapture %b, i256* nocapture %cc, i256* nocapture %dd) nounwind uwtable noinline ssp {
 entry:
   %c = load i256* %cc
diff --git a/test/CodeGen/Generic/PBQP.ll b/test/CodeGen/Generic/PBQP.ll
new file mode 100644
index 0000000..91fcfba
--- /dev/null
+++ b/test/CodeGen/Generic/PBQP.ll
@@ -0,0 +1,29 @@
+; RUN: llc -regalloc=pbqp < %s
+
+define i32 @foo() {
+entry:
+  %call = tail call i32 (...)* @baz()
+  %call1 = tail call i32 (...)* @baz()
+  %call2 = tail call i32 (...)* @baz()
+  %call3 = tail call i32 (...)* @baz()
+  %call4 = tail call i32 (...)* @baz()
+  %call5 = tail call i32 (...)* @baz()
+  %call6 = tail call i32 (...)* @baz()
+  %call7 = tail call i32 (...)* @baz()
+  %call8 = tail call i32 (...)* @baz()
+  %call9 = tail call i32 (...)* @baz()
+  %call10 = tail call i32 (...)* @baz()
+  %call11 = tail call i32 (...)* @baz()
+  %call12 = tail call i32 (...)* @baz()
+  %call13 = tail call i32 (...)* @baz()
+  %call14 = tail call i32 (...)* @baz()
+  %call15 = tail call i32 (...)* @baz()
+  %call16 = tail call i32 (...)* @baz()
+  %call17 = tail call i32 @bar(i32 %call, i32 %call1, i32 %call2, i32 %call3, i32 %call4, i32 %call5, i32 %call6, i32 %call7, i32 %call8, i32 %call9, i32 %call10, i32 %call11, i32 %call12, i32 %call13, i32 %call14, i32 %call15, i32 %call16)
+  ret i32 %call17
+}
+
+declare i32 @baz(...)
+
+declare i32 @bar(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+
diff --git a/test/CodeGen/Generic/assume.ll b/test/CodeGen/Generic/assume.ll
new file mode 100644
index 0000000..bb045b3
--- /dev/null
+++ b/test/CodeGen/Generic/assume.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s
+
+define void @main() {
+        call void @llvm.assume(i1 1)
+        ret void
+}
+
+declare void @llvm.assume(i1) nounwind
+
diff --git a/test/CodeGen/Generic/dbg_value.ll b/test/CodeGen/Generic/dbg_value.ll
index 840eeb0..73e41c7 100644
--- a/test/CodeGen/Generic/dbg_value.ll
+++ b/test/CodeGen/Generic/dbg_value.ll
@@ -4,11 +4,11 @@
 %0 = type { i32, i32 }
 
 define void @t(%0*, i32, i32, i32, i32) nounwind {
-  tail call void @llvm.dbg.value(metadata !{%0* %0}, i64 0, metadata !0)
+  tail call void @llvm.dbg.value(metadata !{%0* %0}, i64 0, metadata !0, metadata !{metadata !"0x102"})
   unreachable
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 ; !0 should conform to the format of DIVariable.
-!0 = metadata !{i32 786689, null, metadata !"a", null, i32 0, null, i32 0, i32 0} ;
+!0 = metadata !{metadata !"0x101\00a\000\000", null, null, null} ; [ DW_TAG_arg_variable ]
diff --git a/test/CodeGen/Generic/empty-insertvalue.ll b/test/CodeGen/Generic/empty-insertvalue.ll
new file mode 100644
index 0000000..e4cc27c
--- /dev/null
+++ b/test/CodeGen/Generic/empty-insertvalue.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s
+
+define void @f() {
+entry:
+  %0 = insertvalue { [0 x { i8*, i8* }], [0 x { i8*, i64 }] } undef, [0 x { i8*, i8* }] undef, 0
+  ret void
+}
diff --git a/test/CodeGen/Hexagon/cmp-not.ll b/test/CodeGen/Hexagon/cmp-not.ll
new file mode 100644
index 0000000..abcddc38
--- /dev/null
+++ b/test/CodeGen/Hexagon/cmp-not.ll
@@ -0,0 +1,50 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; Check that we generate matching compare insn.
+
+; Function Attrs: nounwind
+define i32 @neqi(i32 %argc) #0 {
+entry:
+  %p = alloca i8, align 1
+  %0 = tail call i1 @llvm.hexagon.C4.cmpneqi(i32 %argc, i32 512)
+  %conv = zext i1 %0 to i8
+  store volatile i8 %conv, i8* %p, align 1
+  %p.0.p.0. = load volatile i8* %p, align 1
+  %conv1 = zext i8 %p.0.p.0. to i32
+  ret i32 %conv1
+}
+; CHECK:	p{{[0-3]}}{{ *}} = !cmp.eq(r{{[0-9]+}}, ##512)
+
+; Function Attrs: nounwind readnone
+declare i1 @llvm.hexagon.C4.cmpneqi(i32, i32) #1
+
+; Function Attrs: nounwind
+define i32 @ngti(i32 %argc) #0 {
+entry:
+  %p = alloca i8, align 1
+  %0 = tail call i1 @llvm.hexagon.C4.cmpltei(i32 %argc, i32 4)
+  %conv = zext i1 %0 to i8
+  store volatile i8 %conv, i8* %p, align 1
+  %p.0.p.0. = load volatile i8* %p, align 1
+  %conv1 = zext i8 %p.0.p.0. to i32
+  ret i32 %conv1
+}
+; CHECK:	p{{[0-3]}}{{ *}} = !cmp.gt(r{{[0-9]+}}, #4)
+
+; Function Attrs: nounwind readnone
+declare i1 @llvm.hexagon.C4.cmpltei(i32, i32) #1
+
+; Function Attrs: nounwind
+define i32 @ngtui(i32 %argc) #0 {
+entry:
+  %p = alloca i8, align 1
+  %0 = tail call i1 @llvm.hexagon.C4.cmplteui(i32 %argc, i32 4)
+  %conv = zext i1 %0 to i8
+  store volatile i8 %conv, i8* %p, align 1
+  %p.0.p.0. = load volatile i8* %p, align 1
+  %conv1 = zext i8 %p.0.p.0. to i32
+  ret i32 %conv1
+}
+; CHECK: 	p{{[0-3]}}{{ *}} = !cmp.gtu(r{{[0-9]+}}, #4)
+
+; Function Attrs: nounwind readnone
+declare i1 @llvm.hexagon.C4.cmplteui(i32, i32) #1
diff --git a/test/CodeGen/Hexagon/ctor.ll b/test/CodeGen/Hexagon/ctor.ll
new file mode 100644
index 0000000..2e2fc51
--- /dev/null
+++ b/test/CodeGen/Hexagon/ctor.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=hexagon < %s  | FileCheck -check-prefix=INITARRAY %s
+; RUN: llc -march=hexagon < %s  -use-ctors | FileCheck -check-prefix=CTOR %s
+
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_P10066.ii, i8* null }]
+define internal void @_GLOBAL__sub_I_P10066.ii() {
+entry:
+  ret void
+}
+
+;CTOR: .section	.ctors
+;CTOR-NOT:  section	.init_array
+
+;INITARRAY: section	.init_array
+;INITARRAY-NOT: .section	.ctors
diff --git a/test/CodeGen/Hexagon/hwloop-dbg.ll b/test/CodeGen/Hexagon/hwloop-dbg.ll
index 9537489..f093dae 100644
--- a/test/CodeGen/Hexagon/hwloop-dbg.ll
+++ b/test/CodeGen/Hexagon/hwloop-dbg.ll
@@ -5,9 +5,9 @@ target triple = "hexagon"
 
 define void @foo(i32* nocapture %a, i32* nocapture %b) nounwind {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32* %a}, i64 0, metadata !13), !dbg !17
-  tail call void @llvm.dbg.value(metadata !{i32* %b}, i64 0, metadata !14), !dbg !18
-  tail call void @llvm.dbg.value(metadata !30, i64 0, metadata !15), !dbg !19
+  tail call void @llvm.dbg.value(metadata !{i32* %a}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !17
+  tail call void @llvm.dbg.value(metadata !{i32* %b}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata !30, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !19
   br label %for.body, !dbg !19
 
 for.body:                                         ; preds = %for.body, %entry
@@ -18,11 +18,11 @@ for.body:                                         ; preds = %for.body, %entry
   %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
   %b.addr.01 = phi i32* [ %b, %entry ], [ %incdec.ptr, %for.body ]
   %incdec.ptr = getelementptr inbounds i32* %b.addr.01, i32 1, !dbg !21
-  tail call void @llvm.dbg.value(metadata !{i32* %incdec.ptr}, i64 0, metadata !14), !dbg !21
+  tail call void @llvm.dbg.value(metadata !{i32* %incdec.ptr}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !21
   %0 = load i32* %b.addr.01, align 4, !dbg !21
   store i32 %0, i32* %arrayidx.phi, align 4, !dbg !21
   %inc = add nsw i32 %i.02, 1, !dbg !26
-  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !15), !dbg !26
+  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !26
   %exitcond = icmp eq i32 %inc, 10, !dbg !19
   %arrayidx.inc = getelementptr i32* %arrayidx.phi, i32 1
   br i1 %exitcond, label %for.end, label %for.body, !dbg !19
@@ -31,34 +31,34 @@ for.end:                                          ; preds = %for.body
   ret void, !dbg !27
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!29}
 
-!0 = metadata !{i32 786449, metadata !28, i32 12, metadata !"QuIC LLVM Hexagon Clang version 6.1-pre-unknown, (git://git-hexagon-aus.quicinc.com/llvm/clang-mainline.git e9382867661454cdf44addb39430741578e9765c) (llvm/llvm-mainline.git 36412bb1fcf03ed426d4437b41198bae066675ac)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00QuIC LLVM Hexagon Clang version 6.1-pre-unknown, (git://git-hexagon-aus.quicinc.com/llvm/clang-mainline.git e9382867661454cdf44addb39430741578e9765c) (llvm/llvm-mainline.git 36412bb1fcf03ed426d4437b41198bae066675ac)\001\00\000\00\001", metadata !28, metadata !2, metadata !2, metadata !3, metadata !2, null} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c] [DW_LANG_C99]
 !2 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !28, null, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32*)* @foo, null, null, metadata !11, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!6 = metadata !{i32 786473, metadata !28} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\001\001", metadata !28, null, metadata !7, null, void (i32*, i32*)* @foo, null, null, metadata !11} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!6 = metadata !{metadata !"0x29", metadata !28} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9, metadata !9}
-!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from int]
-!10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from int]
+!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !11 = metadata !{metadata !13, metadata !14, metadata !15}
-!13 = metadata !{i32 786689, metadata !5, metadata !"a", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 1]
-!14 = metadata !{i32 786689, metadata !5, metadata !"b", metadata !6, i32 33554433, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 1]
-!15 = metadata !{i32 786688, metadata !16, metadata !"i", metadata !6, i32 2, metadata !10, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2]
-!16 = metadata !{i32 786443, metadata !28, metadata !5, i32 1, i32 26, i32 0} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
+!13 = metadata !{metadata !"0x101\00a\0016777217\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [a] [line 1]
+!14 = metadata !{metadata !"0x101\00b\0033554433\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [b] [line 1]
+!15 = metadata !{metadata !"0x100\00i\002\000", metadata !16, metadata !6, metadata !10} ; [ DW_TAG_auto_variable ] [i] [line 2]
+!16 = metadata !{metadata !"0xb\001\0026\000", metadata !28, metadata !5} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
 !17 = metadata !{i32 1, i32 15, metadata !5, null}
 !18 = metadata !{i32 1, i32 23, metadata !5, null}
 !19 = metadata !{i32 3, i32 8, metadata !20, null}
-!20 = metadata !{i32 786443, metadata !28, metadata !16, i32 3, i32 3, i32 1} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
+!20 = metadata !{metadata !"0xb\003\003\001", metadata !28, metadata !16} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
 !21 = metadata !{i32 4, i32 5, metadata !22, null}
-!22 = metadata !{i32 786443, metadata !28, metadata !20, i32 3, i32 28, i32 2} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
+!22 = metadata !{metadata !"0xb\003\0028\002", metadata !28, metadata !20} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
 !26 = metadata !{i32 3, i32 23, metadata !20, null}
 !27 = metadata !{i32 6, i32 1, metadata !16, null}
 !28 = metadata !{metadata !"hwloop-dbg.c", metadata !"/usr2/kparzysz/s.hex/t"}
-!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !30 = metadata !{i32 0}
diff --git a/test/CodeGen/Inputs/DbgValueOtherTargets.ll b/test/CodeGen/Inputs/DbgValueOtherTargets.ll
index 953e576..2d05b45 100644
--- a/test/CodeGen/Inputs/DbgValueOtherTargets.ll
+++ b/test/CodeGen/Inputs/DbgValueOtherTargets.ll
@@ -3,28 +3,28 @@
 define i32 @main() nounwind ssp {
 entry:
 ; CHECK: DEBUG_VALUE
-  call void @llvm.dbg.value(metadata !6, i64 0, metadata !7), !dbg !9
+  call void @llvm.dbg.value(metadata !6, i64 0, metadata !7, metadata !{metadata !"0x102"}), !dbg !9
   ret i32 0, !dbg !10
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!13}
 
-!0 = metadata !{i32 786478, metadata !12, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !12} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !12, i32 12, metadata !"clang version 2.9 (trunk 120996)", i1 false, metadata !"", i32 0, metadata !6, metadata !6, metadata !11, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00main\00main\00\002\000\001\000\006\000\000\000", metadata !12, metadata !1, metadata !3, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 120996)\000\00\000\00\000", metadata !12, metadata !6, metadata !6, metadata !11, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, metadata !12, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !12, metadata !2} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 0}
-!7 = metadata !{i32 786688, metadata !8, metadata !"i", metadata !1, i32 3, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!8 = metadata !{i32 786443, metadata !12, metadata !0, i32 2, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
+!7 = metadata !{metadata !"0x100\00i\003\000", metadata !8, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!8 = metadata !{metadata !"0xb\002\0012\000", metadata !12, metadata !0} ; [ DW_TAG_lexical_block ]
 !9 = metadata !{i32 3, i32 11, metadata !8, null}
 !10 = metadata !{i32 4, i32 2, metadata !8, null}
 !11 = metadata !{metadata !0}
 !12 = metadata !{metadata !"/tmp/x.c", metadata !"/Users/manav"}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/MSP430/asm-clobbers.ll b/test/CodeGen/MSP430/asm-clobbers.ll
new file mode 100644
index 0000000..216a3fe
--- /dev/null
+++ b/test/CodeGen/MSP430/asm-clobbers.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:16:16-i32:16:32-a:16-n8:16"
+target triple = "msp430---elf"
+
+define void @test() {
+entry:
+; CHECK-LABEL: test:
+; CHECK: push.w r10
+  call void asm sideeffect "", "~{r10}"()
+; CHECK: pop.w r10
+  ret void
+}
diff --git a/test/CodeGen/MSP430/memset.ll b/test/CodeGen/MSP430/memset.ll
new file mode 100644
index 0000000..bf10544
--- /dev/null
+++ b/test/CodeGen/MSP430/memset.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"
+target triple = "msp430---elf"
+
+@buf = external global i8*
+
+; Function Attrs: nounwind
+define void @test() nounwind {
+entry:
+; CHECK-LABEL: test:
+  %0 = load i8** @buf, align 2
+; CHECK: mov.w &buf, r15
+; CHECK-NEXT: mov.w #5, r14
+; CHECK-NEXT: mov.w #128, r13
+; CHECK-NEXT: call #memset
+  call void @llvm.memset.p0i8.i16(i8* %0, i8 5, i16 128, i32 1, i1 false)
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i16(i8* nocapture, i8, i16, i32, i1) nounwind
+
diff --git a/test/CodeGen/Mips/Fast-ISel/br1.ll b/test/CodeGen/Mips/Fast-ISel/br1.ll
new file mode 100644
index 0000000..579a77f
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/br1.ll
@@ -0,0 +1,34 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+
+@b = global i32 1, align 4
+@i = global i32 0, align 4
+@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1
+
+; Function Attrs: nounwind
+define void @br() #0 {
+entry:
+  %0 = load i32* @b, align 4
+  %tobool = icmp eq i32 %0, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  store i32 6754, i32* @i, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+; FIXME: This instruction is redundant.
+; CHECK:  xor  $[[REG1:[0-9]+]], ${{[0-9]+}}, $zero
+; CHECK:  sltiu  $[[REG2:[0-9]+]], $[[REG1]], 1
+; CHECK:  bgtz  $[[REG2]], $BB[[BL:[0-9]+_[0-9]+]]
+; CHECK:  nop
+; CHECK:  addiu  ${{[0-9]+}}, $zero, 6754
+; CHECK:  sw  ${{[0-9]+}}, 0(${{[0-9]+}})
+; CHECK: $BB[[BL]]:
+
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/Fast-ISel/callabi.ll b/test/CodeGen/Mips/Fast-ISel/callabi.ll
new file mode 100644
index 0000000..44b94bb
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/callabi.ll
@@ -0,0 +1,477 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s -check-prefix=mips32r2
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s -check-prefix=mips32
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s -check-prefix=CHECK2
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s -check-prefix=CHECK2
+
+
+@c1 = global i8 -45, align 1
+@uc1 = global i8 27, align 1
+@s1 = global i16 -1789, align 2
+@us1 = global i16 1256, align 2
+
+; Function Attrs: nounwind
+define void @cxi() #0 {
+entry:
+; CHECK-LABEL:  cxi
+  call void @xi(i32 10)
+; CHECK-DAG:    addiu   $4, $zero, 10
+; CHECK-DAG:    lw      $25, %got(xi)(${{[0-9]+}})
+; CHECK:        jalr    $25
+
+  ret void
+}
+
+declare void @xi(i32) #1
+
+; Function Attrs: nounwind
+define void @cxii() #0 {
+entry:
+; CHECK-LABEL:  cxii
+  call void @xii(i32 746, i32 892)
+; CHECK-DAG:    addiu   $4, $zero, 746
+; CHECK-DAG:    addiu   $5, $zero, 892
+; CHECK-DAG:    lw      $25, %got(xii)(${{[0-9]+}})
+; CHECK:        jalr    $25
+
+  ret void
+}
+
+declare void @xii(i32, i32) #1
+
+; Function Attrs: nounwind
+define void @cxiii() #0 {
+entry:
+; CHECK-LABEL:  cxiii
+  call void @xiii(i32 88, i32 44, i32 11)
+; CHECK-DAG:    addiu   $4, $zero, 88
+; CHECK-DAG:    addiu   $5, $zero, 44
+; CHECK-DAG:    addiu   $6, $zero, 11
+; CHECK-DAG:    lw      $25, %got(xiii)(${{[0-9]+}})
+; CHECK:        jalr    $25
+  ret void
+}
+
+declare void @xiii(i32, i32, i32) #1
+
+; Function Attrs: nounwind
+define void @cxiiii() #0 {
+entry:
+; CHECK-LABEL:  cxiiii
+  call void @xiiii(i32 167, i32 320, i32 97, i32 14)
+; CHECK-DAG:    addiu   $4, $zero, 167
+; CHECK-DAG:    addiu   $5, $zero, 320
+; CHECK-DAG:    addiu   $6, $zero, 97
+; CHECK-DAG:    addiu   $7, $zero, 14
+; CHECK-DAG:    lw      $25, %got(xiiii)(${{[0-9]+}})
+; CHECK:        jalr    $25
+
+  ret void
+}
+
+declare void @xiiii(i32, i32, i32, i32) #1
+
+; Function Attrs: nounwind
+define void @cxiiiiconv() #0 {
+entry:
+; CHECK-LABEL: cxiiiiconv
+; mips32r2-LABEL:  cxiiiiconv
+; mips32-LABEL:  cxiiiiconv
+  %0 = load i8* @c1, align 1
+  %conv = sext i8 %0 to i32
+  %1 = load i8* @uc1, align 1
+  %conv1 = zext i8 %1 to i32
+  %2 = load i16* @s1, align 2
+  %conv2 = sext i16 %2 to i32
+  %3 = load i16* @us1, align 2
+  %conv3 = zext i16 %3 to i32
+  call void @xiiii(i32 %conv, i32 %conv1, i32 %conv2, i32 %conv3)
+; CHECK:        addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; mips32r2:     addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; mips32:       addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; mips32r2-DAG:         lw      $[[REG_C1_ADDR:[0-9]+]], %got(c1)($[[REG_GP]])
+; mips32r2-DAG: lbu     $[[REG_C1:[0-9]+]], 0($[[REG_C1_ADDR]])
+; mips32r2-DAG  seb     $3, $[[REG_C1]]
+; mips32-DAG:   lw      $[[REG_C1_ADDR:[0-9]+]], %got(c1)($[[REG_GP]])
+; mips32-DAG:   lbu     $[[REG_C1:[0-9]+]], 0($[[REG_C1_ADDR]])
+; mips32-DAG:   sll     $[[REG_C1_1:[0-9]+]], $[[REG_C1]], 24
+; mips32-DAG:   sra     $4, $[[REG_C1_1]], 24
+; CHECK-DAG:    lw      $[[REG_UC1_ADDR:[0-9]+]], %got(uc1)($[[REG_GP]])
+; CHECK-DAG:    lbu     $[[REG_UC1:[0-9]+]], 0($[[REG_UC1_ADDR]])
+; FIXME andi is superfulous
+; CHECK-DAG:    andi    $5, $[[REG_UC1]], 255
+; mips32r2-DAG:         lw      $[[REG_S1_ADDR:[0-9]+]], %got(s1)($[[REG_GP]])
+; mips32r2-DAG: lhu     $[[REG_S1:[0-9]+]], 0($[[REG_S1_ADDR]])
+; mips32r2-DAG: seh     $6, $[[REG_S1]]
+; mips32-DAG:   lw      $[[REG_S1_ADDR:[0-9]+]], %got(s1)($[[REG_GP]])
+; mips32-DAG:   lhu     $[[REG_S1:[0-9]+]], 0($[[REG_S1_ADDR]])
+; mips32-DAG:   sll     $[[REG_S1_1:[0-9]+]], $[[REG_S1]], 16
+; mips32-DAG:   sra     $6, $[[REG_S1_1]], 16
+; CHECK-DAG:    lw      $[[REG_US1_ADDR:[0-9]+]], %got(us1)($[[REG_GP]])
+; CHECK-DAG:    lhu     $[[REG_US1:[0-9]+]], 0($[[REG_US1_ADDR]])
+; FIXME andi is superfulous
+; CHECK-DAG:    andi    $7, $[[REG_US1]], 65535
+; mips32r2:     jalr    $25
+; mips32r2:     jalr    $25
+; CHECK:        jalr    $25
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @cxf() #0 {
+entry:
+; CHECK-LABEL:  cxf
+  call void @xf(float 0x40BBC85560000000)
+; CHECK:        addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; CHECK:        lui     $[[REG_FPCONST_1:[0-9]+]], 17886
+; CHECK:        ori     $[[REG_FPCONST:[0-9]+]], $[[REG_FPCONST_1]], 17067
+; CHECK: mtc1   $[[REG_FPCONST]], $f12
+; CHECK:        lw      $25, %got(xf)($[[REG_GP]])
+; CHECK:        jalr    $25
+  ret void
+}
+
+declare void @xf(float) #1
+
+; Function Attrs: nounwind
+define void @cxff() #0 {
+entry:
+; CHECK-LABEL:  cxff
+  call void @xff(float 0x3FF74A6CA0000000, float 0x401A2C0840000000)
+; CHECK:        addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; CHECK-DAG:    lui     $[[REG_FPCONST_1:[0-9]+]], 16314
+; CHECK-DAG:    ori     $[[REG_FPCONST:[0-9]+]], $[[REG_FPCONST_1]], 21349
+; CHECK-DAG: mtc1       $[[REG_FPCONST]], $f12
+; CHECK-DAG:    lui     $[[REG_FPCONST_2:[0-9]+]], 16593
+; CHECK-DAG:    ori     $[[REG_FPCONST_3:[0-9]+]], $[[REG_FPCONST_2]], 24642
+; CHECK-DAG: mtc1       $[[REG_FPCONST_3]], $f14
+; CHECK:        lw      $25, %got(xff)($[[REG_GP]])
+; CHECK:        jalr    $25
+  ret void
+}
+
+declare void @xff(float, float) #1
+
+; Function Attrs: nounwind
+define void @cxfi() #0 {
+entry:
+; CHECK-LABEL: cxfi
+  call void @xfi(float 0x4013906240000000, i32 102)
+; CHECK:        addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; CHECK-DAG:    lui     $[[REG_FPCONST_1:[0-9]+]], 16540
+; CHECK-DAG:    ori     $[[REG_FPCONST:[0-9]+]], $[[REG_FPCONST_1]], 33554
+; CHECK-DAG: mtc1       $[[REG_FPCONST]], $f12
+; CHECK-DAG:    addiu   $5, $zero, 102
+; CHECK:        lw      $25, %got(xfi)($[[REG_GP]])
+; CHECK:        jalr    $25
+
+  ret void
+}
+
+declare void @xfi(float, i32) #1
+
+; Function Attrs: nounwind
+define void @cxfii() #0 {
+entry:
+; CHECK-LABEL: cxfii
+  call void @xfii(float 0x405EC7EE00000000, i32 9993, i32 10922)
+; CHECK:        addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; CHECK-DAG:    lui     $[[REG_FPCONST_1:[0-9]+]], 17142
+; CHECK-DAG:    ori     $[[REG_FPCONST:[0-9]+]], $[[REG_FPCONST_1]], 16240
+; CHECK-DAG: mtc1       $[[REG_FPCONST]], $f12
+; CHECK-DAG:    addiu   $5, $zero, 9993
+; CHECK-DAG:    addiu   $6, $zero, 10922
+; CHECK:        lw      $25, %got(xfii)($[[REG_GP]])
+; CHECK:        jalr    $25
+  ret void
+}
+
+declare void @xfii(float, i32, i32) #1
+
+; Function Attrs: nounwind
+define void @cxfiii() #0 {
+entry:
+; CHECK-LABEL: cxfiii
+  call void @xfiii(float 0x405C072B20000000, i32 3948, i32 89011, i32 111222)
+; CHECK:        addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; CHECK-DAG:    lui     $[[REG_FPCONST_1:[0-9]+]], 17120
+; CHECK-DAG:    ori     $[[REG_FPCONST:[0-9]+]], $[[REG_FPCONST_1]], 14681
+; CHECK-DAG: mtc1       $[[REG_FPCONST]], $f12
+; CHECK-DAG:    addiu   $5, $zero, 3948
+; CHECK-DAG:    lui     $[[REG_I_1:[0-9]+]], 1
+; CHECK-DAG:    ori     $6, $[[REG_I_1]], 23475
+; CHECK-DAG:    lui     $[[REG_I_2:[0-9]+]], 1
+; CHECK-DAG:    ori     $7, $[[REG_I_2]], 45686
+; CHECK:        lw      $25, %got(xfiii)($[[REG_GP]])
+; CHECK:        jalr    $25
+  ret void
+}
+
+declare void @xfiii(float, i32, i32, i32) #1
+
+; Function Attrs: nounwind
+define void @cxd() #0 {
+entry:
+; mips32r2-LABEL: cxd:
+; mips32-LABEL: cxd:
+  call void @xd(double 5.994560e+02)
+; mips32:       addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; mips32-DAG:   lui     $[[REG_FPCONST_1:[0-9]+]], 16514
+; mips32-DAG:   ori     $[[REG_FPCONST_2:[0-9]+]], $[[REG_FPCONST_1]], 48037
+; mips32-DAG:   lui     $[[REG_FPCONST_3:[0-9]+]], 58195
+; mips32-DAG:   ori     $[[REG_FPCONST_4:[0-9]+]], $[[REG_FPCONST_3]], 63439
+; mips32-DAG:    mtc1   $[[REG_FPCONST_4]], $f12
+; mips32-DAG:       mtc1        $[[REG_FPCONST_2]], $f13
+; mips32-DAG:   lw      $25, %got(xd)($[[REG_GP]])
+; mips32:       jalr    $25
+; mips32r2:     addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; mips32r2-DAG: lui     $[[REG_FPCONST_1:[0-9]+]], 16514
+; mips32r2-DAG: ori     $[[REG_FPCONST_2:[0-9]+]], $[[REG_FPCONST_1]], 48037
+; mips32r2-DAG: lui     $[[REG_FPCONST_3:[0-9]+]], 58195
+; mips32r2-DAG: ori     $[[REG_FPCONST_4:[0-9]+]], $[[REG_FPCONST_3]], 63439
+; mips32r2-DAG: mtc1    $[[REG_FPCONST_4]], $f12
+; mips32r2-DAG: mthc1   $[[REG_FPCONST_2]], $f12
+; mips32r2-DAG: lw      $25, %got(xd)($[[REG_GP]])
+; mips32r2 :    jalr    $25
+  ret void
+}
+
+declare void @xd(double) #1
+
+; Function Attrs: nounwind
+define void @cxdd() #0 {
+; mips32r2-LABEL: cxdd:
+; mips32-LABEL: cxdd:
+entry:
+  call void @xdd(double 1.234980e+03, double 0x40F5B331F7CED917)
+; mips32:       addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; mips32-DAG:   lui     $[[REG_FPCONST_1:[0-9]+]], 16531
+; mips32-DAG:   ori     $[[REG_FPCONST_2:[0-9]+]], $[[REG_FPCONST_1]], 19435
+; mips32-DAG:   lui     $[[REG_FPCONST_3:[0-9]+]], 34078
+; mips32-DAG:   ori     $[[REG_FPCONST_4:[0-9]+]], $[[REG_FPCONST_3]], 47186
+; mips32-DAG:   mtc1    $[[REG_FPCONST_4]], $f12
+; mips32-DAG:   mtc1    $[[REG_FPCONST_2]], $f13
+; mips32-DAG:   lui     $[[REG_FPCONST_1:[0-9]+]], 16629
+; mips32-DAG:   ori     $[[REG_FPCONST_2:[0-9]+]], $[[REG_FPCONST_1]], 45873
+; mips32-DAG:   lui     $[[REG_FPCONST_3:[0-9]+]], 63438
+; mips32-DAG:   ori     $[[REG_FPCONST_4:[0-9]+]], $[[REG_FPCONST_3]], 55575
+; mips32-DAG:   mtc1    $[[REG_FPCONST_4]], $f14
+; mips32-DAG:   mtc1    $[[REG_FPCONST_2]], $f15
+; mips32-DAG:   lw      $25, %got(xdd)($[[REG_GP]])
+; mips32:       jalr    $25
+; mips32r2:     addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; mips32r2-DAG: lui     $[[REG_FPCONST_1:[0-9]+]], 16531
+; mips32r2-DAG: ori     $[[REG_FPCONST_2:[0-9]+]], $[[REG_FPCONST_1]], 19435
+; mips32r2-DAG: lui     $[[REG_FPCONST_3:[0-9]+]], 34078
+; mips32r2-DAG: ori     $[[REG_FPCONST_4:[0-9]+]], $[[REG_FPCONST_3]], 47186
+; mips32r2-DAG: mtc1    $[[REG_FPCONST_4]], $f12
+; mips32r2-DAG: mthc1   $[[REG_FPCONST_2]], $f12
+; mips32r2-DAG: lui     $[[REG_FPCONST_1:[0-9]+]], 16629
+; mips32r2-DAG: ori     $[[REG_FPCONST_2:[0-9]+]], $[[REG_FPCONST_1]], 45873
+; mips32r2-DAG: lui     $[[REG_FPCONST_3:[0-9]+]], 63438
+; mips32r2-DAG: ori     $[[REG_FPCONST_4:[0-9]+]], $[[REG_FPCONST_3]], 55575
+; mips32r2-DAG: mtc1    $[[REG_FPCONST_4]], $f14
+; mips32r2-DAG: mthc1   $[[REG_FPCONST_2]], $f14
+; mips32r2-DAG: lw      $25, %got(xdd)($[[REG_GP]])
+; mips32r2 :    jalr    $25
+  ret void
+}
+
+declare void @xdd(double, double) #1
+
+; Function Attrs: nounwind
+define void @cxif() #0 {
+entry:
+; CHECK-LABEL: cxif:
+  call void @xif(i32 345, float 0x407BCE5A20000000)
+; CHECK-DAG:    addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; CHECK-DAG:    addiu   $4, $zero, 345
+; CHECK-DAG:    lui     $[[REGF_1:[0-9]+]], 17374
+; CHECK-DAG:    ori     $[[REGF_2:[0-9]+]], $[[REGF_1]], 29393
+; CHECK-DAG:    mtc1    $[[REGF_2]], $f[[REGF_3:[0-9]+]]
+; CHECK-DAG:    mfc1    $5, $f[[REGF_3]]
+; CHECK-DAG:    lw      $25, %got(xif)($[[REG_GP]])
+; CHECK:        jalr    $25
+
+  ret void
+}
+
+declare void @xif(i32, float) #1
+
+; Function Attrs: nounwind
+define void @cxiff() #0 {
+entry:
+; CHECK-LABEL: cxiff:
+; CHECK2-LABEL: cxiff:
+  call void @xiff(i32 12239, float 0x408EDB3340000000, float 0x4013FFE5C0000000)
+; We need to do the two floating point parameters in a separate
+; check because we can't control the ordering of parts of the sequence
+;;
+; CHECK:        addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; CHECK:        addiu   $4, $zero, 12239
+; CHECK2:       addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; CHECK2:       addiu   $4, $zero, 12239
+; CHECK:        lui     $[[REGF_1:[0-9]+]], 17526
+; CHECK:        ori     $[[REGF_2:[0-9]+]], $[[REGF_1]], 55706
+; CHECK:        mtc1    $[[REGF_2]], $f[[REGF_3:[0-9]+]]
+; CHECK:        mfc1    $5, $f[[REGF_3]]
+; CHECK2:       lui     $[[REGF2_1:[0-9]+]], 16543
+; CHECK2:       ori     $[[REGF2_2:[0-9]+]], $[[REGF2_1]], 65326
+; CHECK2:       mtc1    $[[REGF2_2]], $f[[REGF2_3:[0-9]+]]
+; CHECK2:       mfc1    $6, $f[[REGF2_3]]
+; CHECK:        lw      $25, %got(xiff)($[[REG_GP]])
+; CHECK2:       lw      $25, %got(xiff)($[[REG_GP]])
+; CHECK:        jalr    $25
+; CHECK2:       jalr    $25
+  ret void
+}
+
+declare void @xiff(i32, float, float) #1
+
+; Function Attrs: nounwind
+define void @cxifi() #0 {
+entry:
+; CHECK: cxifi:
+  call void @xifi(i32 887, float 0x402277CEE0000000, i32 888)
+; CHECK-DAG:    addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; CHECK-DAG:    addiu   $4, $zero, 887
+; CHECK-DAG:    lui     $[[REGF_1:[0-9]+]], 16659
+; CHECK-DAG:    ori     $[[REGF_2:[0-9]+]], $[[REGF_1]], 48759
+; CHECK-DAG:    mtc1    $[[REGF_2]], $f[[REGF_3:[0-9]+]]
+; CHECK-DAG:    mfc1    $5, $f[[REGF_3]]
+; CHECk-DAG:    addiu   $6, $zero, 888
+; CHECK-DAG:    lw      $25, %got(xifi)($[[REG_GP]])
+; CHECK:        jalr    $25
+
+  ret void
+}
+
+declare void @xifi(i32, float, i32) #1
+
+; Function Attrs: nounwind
+define void @cxifif() #0 {
+entry:
+; CHECK: cxifif:
+; CHECK2: cxifif:
+  call void @xifif(i32 67774, float 0x408EE0FBE0000000, i32 9991, float 0x40B15C8CC0000000)
+; CHECK-DAG:    addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; CHECK-DAG:    addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; CHECK-DAG:    lui     $[[REGI:[0-9]+]], 1
+; CHECK-DAG:    ori     $4, $[[REGI]], 2238
+; CHECK-DAG:    lui     $[[REGF_1:[0-9]+]], 17527
+; CHECK-DAG:    ori     $[[REGF_2:[0-9]+]], $[[REGF_1]], 2015
+; CHECK-DAG:    mtc1    $[[REGF_2]], $f[[REGF_3:[0-9]+]]
+; CHECK-DAG:    mfc1    $5, $f[[REGF_3]]
+; CHECk-DAG:    addiu   $6, $zero, 888
+; CHECK2:       lui     $[[REGF2_1:[0-9]+]], 17802
+; CHECK2:       ori     $[[REGF2_2:[0-9]+]], $[[REGF2_1]], 58470
+; CHECK2:       mtc1    $[[REGF2_2]], $f[[REGF2_3:[0-9]+]]
+; CHECK2:       mfc1    $7, $f[[REGF2_3]]
+; CHECK:        lw      $25, %got(xifif)($[[REG_GP]])
+; CHECK2:       lw      $25, %got(xifif)($[[REG_GP]])
+; CHECK2:       jalr    $25
+; CHECK:        jalr    $25
+
+  ret void
+}
+
+declare void @xifif(i32, float, i32, float) #1
+
+; Function Attrs: nounwind
+define void @cxiffi() #0 {
+entry:
+; CHECK-label: cxiffi:
+; CHECK2-label: cxiffi:
+  call void @xiffi(i32 45, float 0x3FF6666660000000, float 0x408F333340000000, i32 234)
+; CHECK-DAG:    addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; CHECK-DAG:    addiu   $4, $zero, 45
+; CHECK2-DAG:   addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; CHECK2-DAG:   addiu   $4, $zero, 45
+; CHECK-DAG:    lui     $[[REGF_1:[0-9]+]], 16307
+; CHECK-DAG:    ori     $[[REGF_2:[0-9]+]], $[[REGF_1]], 13107
+; CHECK-DAG:    mtc1    $[[REGF_2]], $f[[REGF_3:[0-9]+]]
+; CHECK-DAG:    mfc1    $5, $f[[REGF_3]]
+; CHECK2:       lui     $[[REGF2_1:[0-9]+]], 17529
+; CHECK2:       ori     $[[REGF2_2:[0-9]+]], $[[REGF2_1]], 39322
+; CHECK2:       mtc1    $[[REGF2_2]], $f[[REGF2_3:[0-9]+]]
+; CHECK2:       mfc1    $6, $f[[REGF2_3]]
+; CHECK-DAG:    lw      $25, %got(xiffi)($[[REG_GP]])
+; CHECK-DAG:    addiu   $7, $zero, 234
+; CHECK2-DAG:   lw      $25, %got(xiffi)($[[REG_GP]])
+; CHECK:        jalr    $25
+; CHECK2:       jalr    $25
+
+  ret void
+}
+
+declare void @xiffi(i32, float, float, i32) #1
+
+; Function Attrs: nounwind
+define void @cxifii() #0 {
+entry:
+; CHECK-DAG:    cxifii:
+  call void @xifii(i32 12239, float 0x408EDB3340000000, i32 998877, i32 1234)
+; CHECK-DAG:    addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9+]}}
+; CHECK-DAG:    addiu   $4, $zero, 12239
+; CHECK-DAG:    lui     $[[REGF_1:[0-9]+]], 17526
+; CHECK-DAG:    ori     $[[REGF_2:[0-9]+]], $[[REGF_1]], 55706
+; CHECK-DAG:    mtc1    $[[REGF_2]], $f[[REGF_3:[0-9]+]]
+; CHECK-DAG:    mfc1    $5, $f[[REGF_3]]
+; CHECK-DAG:    lui     $[[REGI2:[0-9]+]], 15
+; CHECK-DAG:    ori     $6, $[[REGI2]], 15837
+; CHECk-DAG:    addiu   $7, $zero, 1234
+; CHECK-DAG:    lw      $25, %got(xifii)($[[REG_GP]])
+; CHECK:        jalr    $25
+  ret void
+}
+
+declare void @xifii(i32, float, i32, i32) #1
+
+; FIXME: this function will not pass yet. 
+; Function Attrs: nounwind
+; define void @cxfid() #0 {
+;entry:
+;  call void @xfid(float 0x4013B851E0000000, i32 811123, double 0x40934BFF487FCB92)
+;  ret void
+;}
+
+declare void @xfid(float, i32, double) #1
+
+; Function Attrs: nounwind
+define void @g() #0 {
+entry:
+  call void @cxi()
+  call void @cxii()
+  call void @cxiii()
+  call void @cxiiii()
+  call void @cxiiiiconv()
+  call void @cxf()
+  call void @cxff()
+  call void @cxd()
+  call void @cxfi()
+  call void @cxfii()
+  call void @cxfiii()
+  call void @cxdd()
+  call void @cxif()
+  call void @cxiff()
+  call void @cxifi()
+  call void @cxifii()
+  call void @cxifif()
+  call void @cxiffi()
+  ret void
+}
+
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.6.0 (gitosis@dmz-portal.mips.com:clang 43992fe7b17de5553ac06d323cb80cc6723a9ae3) (gitosis@dmz-portal.mips.com:llvm.git 0834e6839eb170197c81bb02e916258d1527e312)"}
diff --git a/test/CodeGen/Mips/Fast-ISel/fpcmpa.ll b/test/CodeGen/Mips/Fast-ISel/fpcmpa.ll
new file mode 100644
index 0000000..c72b1e7
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/fpcmpa.ll
@@ -0,0 +1,254 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+
+@f1 = common global float 0.000000e+00, align 4
+@f2 = common global float 0.000000e+00, align 4
+@b1 = common global i32 0, align 4
+@d1 = common global double 0.000000e+00, align 8
+@d2 = common global double 0.000000e+00, align 8
+
+; Function Attrs: nounwind
+define void @feq1()  {
+entry:
+  %0 = load float* @f1, align 4
+  %1 = load float* @f2, align 4
+  %cmp = fcmp oeq float %0, %1
+; CHECK-LABEL:  feq1:
+; CHECK-DAG:    lw      $[[REG_F2_GOT:[0-9]+]], %got(f2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_F1_GOT:[0-9]+]], %got(f1)(${{[0-9]+}})
+; CHECK-DAG:    lwc1    $f[[REG_F2:[0-9]+]], 0($[[REG_F2_GOT]])
+; CHECK-DAG:    lwc1    $f[[REG_F1:[0-9]+]], 0($[[REG_F1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.eq.s  $f[[REG_F1]], $f[[REG_F2]]
+; CHECK:        movt  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @fne1()  {
+entry:
+  %0 = load float* @f1, align 4
+  %1 = load float* @f2, align 4
+  %cmp = fcmp une float %0, %1
+; CHECK-LABEL:  fne1:
+; CHECK-DAG:    lw      $[[REG_F2_GOT:[0-9]+]], %got(f2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_F1_GOT:[0-9]+]], %got(f1)(${{[0-9]+}})
+; CHECK-DAG:    lwc1    $f[[REG_F2:[0-9]+]], 0($[[REG_F2_GOT]])
+; CHECK-DAG:    lwc1    $f[[REG_F1:[0-9]+]], 0($[[REG_F1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.eq.s  $f[[REG_F1]], $f[[REG_F2]]
+; CHECK:        movf  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @flt1()  {
+entry:
+  %0 = load float* @f1, align 4
+  %1 = load float* @f2, align 4
+  %cmp = fcmp olt float %0, %1
+; CHECK-LABEL:  flt1:
+; CHECK-DAG:    lw      $[[REG_F2_GOT:[0-9]+]], %got(f2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_F1_GOT:[0-9]+]], %got(f1)(${{[0-9]+}})
+; CHECK-DAG:    lwc1    $f[[REG_F2:[0-9]+]], 0($[[REG_F2_GOT]])
+; CHECK-DAG:    lwc1    $f[[REG_F1:[0-9]+]], 0($[[REG_F1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.olt.s  $f[[REG_F1]], $f[[REG_F2]]
+; CHECK:        movt  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @fgt1()  {
+entry:
+  %0 = load float* @f1, align 4
+  %1 = load float* @f2, align 4
+  %cmp = fcmp ogt float %0, %1
+; CHECK-LABEL: fgt1:
+; CHECK-DAG:    lw      $[[REG_F2_GOT:[0-9]+]], %got(f2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_F1_GOT:[0-9]+]], %got(f1)(${{[0-9]+}})
+; CHECK-DAG:    lwc1    $f[[REG_F2:[0-9]+]], 0($[[REG_F2_GOT]])
+; CHECK-DAG:    lwc1    $f[[REG_F1:[0-9]+]], 0($[[REG_F1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.ule.s  $f[[REG_F1]], $f[[REG_F2]]
+; CHECK:        movf  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @fle1()  {
+entry:
+  %0 = load float* @f1, align 4
+  %1 = load float* @f2, align 4
+  %cmp = fcmp ole float %0, %1
+; CHECK-LABEL:  fle1:
+; CHECK-DAG:    lw      $[[REG_F2_GOT:[0-9]+]], %got(f2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_F1_GOT:[0-9]+]], %got(f1)(${{[0-9]+}})
+; CHECK-DAG:    lwc1    $f[[REG_F2:[0-9]+]], 0($[[REG_F2_GOT]])
+; CHECK-DAG:    lwc1    $f[[REG_F1:[0-9]+]], 0($[[REG_F1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.ole.s  $f[[REG_F1]], $f[[REG_F2]]
+; CHECK:        movt  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @fge1()  {
+entry:
+  %0 = load float* @f1, align 4
+  %1 = load float* @f2, align 4
+  %cmp = fcmp oge float %0, %1
+; CHECK-LABEL:  fge1:
+; CHECK-DAG:    lw      $[[REG_F2_GOT:[0-9]+]], %got(f2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_F1_GOT:[0-9]+]], %got(f1)(${{[0-9]+}})
+; CHECK-DAG:    lwc1    $f[[REG_F2:[0-9]+]], 0($[[REG_F2_GOT]])
+; CHECK-DAG:    lwc1    $f[[REG_F1:[0-9]+]], 0($[[REG_F1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.ult.s  $f[[REG_F1]], $f[[REG_F2]]
+; CHECK:        movf  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @deq1()  {
+entry:
+  %0 = load double* @d1, align 8
+  %1 = load double* @d2, align 8
+  %cmp = fcmp oeq double %0, %1
+; CHECK-LABEL:  deq1:
+; CHECK-DAG:    lw      $[[REG_D2_GOT:[0-9]+]], %got(d2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_D1_GOT:[0-9]+]], %got(d1)(${{[0-9]+}})
+; CHECK-DAG:    ldc1    $f[[REG_D2:[0-9]+]], 0($[[REG_D2_GOT]])
+; CHECK-DAG:    ldc1    $f[[REG_D1:[0-9]+]], 0($[[REG_D1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.eq.d  $f[[REG_D1]], $f[[REG_D2]]
+; CHECK:        movt  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @dne1()  {
+entry:
+  %0 = load double* @d1, align 8
+  %1 = load double* @d2, align 8
+  %cmp = fcmp une double %0, %1
+; CHECK-LABEL:  dne1:
+; CHECK-DAG:    lw      $[[REG_D2_GOT:[0-9]+]], %got(d2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_D1_GOT:[0-9]+]], %got(d1)(${{[0-9]+}})
+; CHECK-DAG:    ldc1    $f[[REG_D2:[0-9]+]], 0($[[REG_D2_GOT]])
+; CHECK-DAG:    ldc1    $f[[REG_D1:[0-9]+]], 0($[[REG_D1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.eq.d  $f[[REG_D1]], $f[[REG_D2]]
+; CHECK:        movf  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @dlt1()  {
+entry:
+  %0 = load double* @d1, align 8
+  %1 = load double* @d2, align 8
+  %cmp = fcmp olt double %0, %1
+; CHECK-LABEL:  dlt1:
+; CHECK-DAG:    lw      $[[REG_D2_GOT:[0-9]+]], %got(d2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_D1_GOT:[0-9]+]], %got(d1)(${{[0-9]+}})
+; CHECK-DAG:    ldc1    $f[[REG_D2:[0-9]+]], 0($[[REG_D2_GOT]])
+; CHECK-DAG:    ldc1    $f[[REG_D1:[0-9]+]], 0($[[REG_D1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.olt.d  $f[[REG_D1]], $f[[REG_D2]]
+; CHECK:        movt  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @dgt1()  {
+entry:
+  %0 = load double* @d1, align 8
+  %1 = load double* @d2, align 8
+  %cmp = fcmp ogt double %0, %1
+; CHECK-LABEL:  dgt1:
+; CHECK-DAG:    lw      $[[REG_D2_GOT:[0-9]+]], %got(d2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_D1_GOT:[0-9]+]], %got(d1)(${{[0-9]+}})
+; CHECK-DAG:    ldc1    $f[[REG_D2:[0-9]+]], 0($[[REG_D2_GOT]])
+; CHECK-DAG:    ldc1    $f[[REG_D1:[0-9]+]], 0($[[REG_D1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.ule.d  $f[[REG_D1]], $f[[REG_D2]]
+; CHECK:        movf  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @dle1()  {
+entry:
+  %0 = load double* @d1, align 8
+  %1 = load double* @d2, align 8
+  %cmp = fcmp ole double %0, %1
+; CHECK-LABEL:  dle1:
+; CHECK-DAG:    lw      $[[REG_D2_GOT:[0-9]+]], %got(d2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_D1_GOT:[0-9]+]], %got(d1)(${{[0-9]+}})
+; CHECK-DAG:    ldc1    $f[[REG_D2:[0-9]+]], 0($[[REG_D2_GOT]])
+; CHECK-DAG:    ldc1    $f[[REG_D1:[0-9]+]], 0($[[REG_D1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.ole.d  $f[[REG_D1]], $f[[REG_D2]]
+; CHECK:        movt  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @dge1()  {
+entry:
+  %0 = load double* @d1, align 8
+  %1 = load double* @d2, align 8
+  %cmp = fcmp oge double %0, %1
+; CHECK-LABEL:  dge1:
+; CHECK-DAG:    lw      $[[REG_D2_GOT:[0-9]+]], %got(d2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_D1_GOT:[0-9]+]], %got(d1)(${{[0-9]+}})
+; CHECK-DAG:    ldc1    $f[[REG_D2:[0-9]+]], 0($[[REG_D2_GOT]])
+; CHECK-DAG:    ldc1    $f[[REG_D1:[0-9]+]], 0($[[REG_D1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.ult.d  $f[[REG_D1]], $f[[REG_D2]]
+; CHECK:        movf  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/Fast-ISel/fpext.ll b/test/CodeGen/Mips/Fast-ISel/fpext.ll
new file mode 100644
index 0000000..98aca75
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/fpext.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+
+@f = global float 0x40147E6B80000000, align 4
+@d_f = common global double 0.000000e+00, align 8
+@.str = private unnamed_addr constant [6 x i8] c"%f  \0A\00", align 1
+
+; Function Attrs: nounwind
+define void @dv() #0 {
+entry:
+  %0 = load float* @f, align 4
+  %conv = fpext float %0 to double
+; CHECK: cvt.d.s  $f{{[0-9]+}}, $f{{[0-9]+}}
+  store double %conv, double* @d_f, align 8
+  ret void
+}
+
+
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/Mips/Fast-ISel/fpintconv.ll b/test/CodeGen/Mips/Fast-ISel/fpintconv.ll
new file mode 100644
index 0000000..846726a
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/fpintconv.ll
@@ -0,0 +1,35 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+
+
+@f = global float 0x40D6E83280000000, align 4
+@d = global double 0x4132D68780000000, align 8
+@i_f = common global i32 0, align 4
+@i_d = common global i32 0, align 4
+@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1
+
+; Function Attrs: nounwind
+define void @ifv() {
+entry:
+; CHECK-LABEL:   .ent  ifv
+  %0 = load float* @f, align 4
+  %conv = fptosi float %0 to i32
+; CHECK:   trunc.w.s  $f[[REG:[0-9]+]], $f{{[0-9]+}}
+; CHECK:   mfc1	${{[0-9]+}}, $f[[REG]]
+  store i32 %conv, i32* @i_f, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @idv() {
+entry:
+; CHECK-LABEL:   .ent  idv
+  %0 = load double* @d, align 8
+  %conv = fptosi double %0 to i32
+; CHECK:   trunc.w.d  $f[[REG:[0-9]+]], $f{{[0-9]+}}
+; CHECK:   mfc1	${{[0-9]+}}, $f[[REG]]
+  store i32 %conv, i32* @i_d, align 4
+  ret void
+}
diff --git a/test/CodeGen/Mips/Fast-ISel/fptrunc.ll b/test/CodeGen/Mips/Fast-ISel/fptrunc.ll
new file mode 100644
index 0000000..d843dee
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/fptrunc.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+
+@d = global double 0x40147E6B74DF0446, align 8
+@f = common global float 0.000000e+00, align 4
+@.str = private unnamed_addr constant [6 x i8] c"%f  \0A\00", align 1
+
+; Function Attrs: nounwind
+define void @fv() #0 {
+entry:
+  %0 = load double* @d, align 8
+  %conv = fptrunc double %0 to float
+; CHECK: cvt.s.d  $f{{[0-9]+}}, $f{{[0-9]+}}
+  store float %conv, float* @f, align 4
+  ret void
+}
+
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/Mips/Fast-ISel/icmpa.ll b/test/CodeGen/Mips/Fast-ISel/icmpa.ll
new file mode 100644
index 0000000..bd41a29
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/icmpa.ll
@@ -0,0 +1,210 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+
+@c = global i32 4, align 4
+@d = global i32 9, align 4
+@uc = global i32 4, align 4
+@ud = global i32 9, align 4
+@b1 = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define void @eq()  {
+entry:
+; CHECK-LABEL:  .ent  eq
+
+  %0 = load i32* @c, align 4
+  %1 = load i32* @d, align 4
+  %cmp = icmp eq i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_D_GOT:[0-9+]]], %got(d)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_C_GOT:[0-9+]]], %got(c)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_D:[0-9]+]], 0($[[REG_D_GOT]])
+; CHECK-DAG:  lw	$[[REG_C:[0-9]+]], 0($[[REG_C_GOT]])
+; CHECK:  xor  $[[REG1:[0-9]+]], $[[REG_C]], $[[REG_D]]
+; CHECK:  sltiu  $[[REG2:[0-9]+]], $[[REG1]], 1
+; FIXME: This instruction is redundant. The sltiu can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG2]], 1
+
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @ne()  {
+entry:
+; CHECK-LABEL:  .ent  ne
+  %0 = load i32* @c, align 4
+  %1 = load i32* @d, align 4
+  %cmp = icmp ne i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_D_GOT:[0-9+]]], %got(d)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_C_GOT:[0-9+]]], %got(c)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_D:[0-9]+]], 0($[[REG_D_GOT]])
+; CHECK-DAG:  lw	$[[REG_C:[0-9]+]], 0($[[REG_C_GOT]])
+; CHECK:  xor  $[[REG1:[0-9]+]], $[[REG_C]], $[[REG_D]]
+; CHECK:  sltu  $[[REG2:[0-9]+]], $zero, $[[REG1]]
+; FIXME: This instruction is redundant. The sltu can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG2]], 1
+
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @ugt()  {
+entry:
+; CHECK-LABEL:  .ent  ugt
+  %0 = load i32* @uc, align 4
+  %1 = load i32* @ud, align 4
+  %cmp = icmp ugt i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_UD_GOT:[0-9+]]], %got(ud)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_UC_GOT:[0-9+]]], %got(uc)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_UD:[0-9]+]], 0($[[REG_UD_GOT]])
+; CHECK-DAG:  lw	$[[REG_UC:[0-9]+]], 0($[[REG_UC_GOT]])
+; CHECK:  sltu  $[[REG1:[0-9]+]], $[[REG_UD]], $[[REG_UC]]
+; FIXME: This instruction is redundant. The sltu can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG1]], 1
+
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @ult()  {
+entry:
+; CHECK-LABEL:  .ent  ult
+  %0 = load i32* @uc, align 4
+  %1 = load i32* @ud, align 4
+  %cmp = icmp ult i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_UD_GOT:[0-9+]]], %got(ud)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_UC_GOT:[0-9+]]], %got(uc)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_UD:[0-9]+]], 0($[[REG_UD_GOT]])
+; CHECK-DAG:  lw	$[[REG_UC:[0-9]+]], 0($[[REG_UC_GOT]])
+; CHECK:  sltu  $[[REG1:[0-9]+]], $[[REG_UC]], $[[REG_UD]]
+; FIXME: This instruction is redundant. The sltu can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG1]], 1
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @uge()  {
+entry:
+; CHECK-LABEL:  .ent  uge
+  %0 = load i32* @uc, align 4
+  %1 = load i32* @ud, align 4
+  %cmp = icmp uge i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_UD_GOT:[0-9+]]], %got(ud)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_UC_GOT:[0-9+]]], %got(uc)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_UD:[0-9]+]], 0($[[REG_UD_GOT]])
+; CHECK-DAG:  lw	$[[REG_UC:[0-9]+]], 0($[[REG_UC_GOT]])
+; CHECK:  sltu  $[[REG1:[0-9]+]], $[[REG_UC]], $[[REG_UD]]
+; CHECK:  xori  $[[REG2:[0-9]+]], $[[REG1]], 1
+; FIXME: This instruction is redundant. The sltu can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG2]], 1
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @ule()  {
+entry:
+; CHECK-LABEL:  .ent  ule
+  %0 = load i32* @uc, align 4
+  %1 = load i32* @ud, align 4
+  %cmp = icmp ule i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_UD_GOT:[0-9+]]], %got(ud)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_UC_GOT:[0-9+]]], %got(uc)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_UD:[0-9]+]], 0($[[REG_UD_GOT]])
+; CHECK-DAG:  lw	$[[REG_UC:[0-9]+]], 0($[[REG_UC_GOT]])
+; CHECK:  sltu  $[[REG1:[0-9]+]], $[[REG_UD]], $[[REG_UC]]
+; CHECK:  xori  $[[REG2:[0-9]+]], $[[REG1]], 1
+; FIXME: This instruction is redundant. The sltu can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG2]], 1
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @sgt()  {
+entry:
+; CHECK-LABEL:  .ent sgt
+  %0 = load i32* @c, align 4
+  %1 = load i32* @d, align 4
+  %cmp = icmp sgt i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_D_GOT:[0-9+]]], %got(d)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_C_GOT:[0-9+]]], %got(c)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_D:[0-9]+]], 0($[[REG_D_GOT]])
+; CHECK-DAG:  lw	$[[REG_C:[0-9]+]], 0($[[REG_C_GOT]])
+; CHECK:  slt  $[[REG1:[0-9]+]], $[[REG_D]], $[[REG_C]]
+; FIXME: This instruction is redundant. The slt can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG1]], 1
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @slt()  {
+entry:
+; CHECK-LABEL:  .ent slt
+  %0 = load i32* @c, align 4
+  %1 = load i32* @d, align 4
+  %cmp = icmp slt i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_D_GOT:[0-9+]]], %got(d)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_C_GOT:[0-9+]]], %got(c)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_D:[0-9]+]], 0($[[REG_D_GOT]])
+; CHECK-DAG:  lw	$[[REG_C:[0-9]+]], 0($[[REG_C_GOT]])
+; CHECK:  slt  $[[REG1:[0-9]+]], $[[REG_C]], $[[REG_D]]
+; FIXME: This instruction is redundant. The slt can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG1]], 1
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @sge()  {
+entry:
+; CHECK-LABEL:  .ent sge
+  %0 = load i32* @c, align 4
+  %1 = load i32* @d, align 4
+  %cmp = icmp sge i32 %0, %1
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+; CHECK-DAG:  lw	$[[REG_D_GOT:[0-9+]]], %got(d)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_C_GOT:[0-9+]]], %got(c)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_D:[0-9]+]], 0($[[REG_D_GOT]])
+; CHECK-DAG:  lw	$[[REG_C:[0-9]+]], 0($[[REG_C_GOT]])
+; CHECK:  slt  $[[REG1:[0-9]+]], $[[REG_C]], $[[REG_D]]
+; CHECK:  xori  $[[REG2:[0-9]+]], $[[REG1]], 1
+; FIXME: This instruction is redundant. The slt can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG2]], 1
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @sle()  {
+entry:
+; CHECK-LABEL:  .ent sle
+  %0 = load i32* @c, align 4
+  %1 = load i32* @d, align 4
+  %cmp = icmp sle i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_D_GOT:[0-9+]]], %got(d)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_C_GOT:[0-9+]]], %got(c)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_D:[0-9]+]], 0($[[REG_D_GOT]])
+; CHECK-DAG:  lw	$[[REG_C:[0-9]+]], 0($[[REG_C_GOT]])
+; CHECK:        slt     $[[REG1:[0-9]+]], $[[REG_D]], $[[REG_C]]
+; CHECK:        xori    $[[REG2:[0-9]+]], $[[REG1]], 1
+; FIXME: This instruction is redundant. The slt can only produce 0 and 1.
+; CHECK:        andi    ${{[0-9]+}}, $[[REG2]], 1
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
diff --git a/test/CodeGen/Mips/Fast-ISel/loadstore2.ll b/test/CodeGen/Mips/Fast-ISel/loadstore2.ll
index f113a0e..d84478b 100644
--- a/test/CodeGen/Mips/Fast-ISel/loadstore2.ll
+++ b/test/CodeGen/Mips/Fast-ISel/loadstore2.ll
@@ -6,6 +6,8 @@ target triple = "mips--linux-gnu"
 @c1 = common global i8 0, align 1
 ; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
 ; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
 
 @s2 = common global i16 0, align 2
 @s1 = common global i16 0, align 2
diff --git a/test/CodeGen/Mips/Fast-ISel/loadstoreconv.ll b/test/CodeGen/Mips/Fast-ISel/loadstoreconv.ll
new file mode 100644
index 0000000..f7f2c64
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/loadstoreconv.ll
@@ -0,0 +1,179 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s -check-prefix=mips32r2
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s -check-prefix=mips32
+
+@b2 = global i8 0, align 1
+@b1 = global i8 1, align 1
+@uc1 = global i8 0, align 1
+@uc2 = global i8 -1, align 1
+@sc1 = global i8 -128, align 1
+@sc2 = global i8 127, align 1
+@ss1 = global i16 -32768, align 2
+@ss2 = global i16 32767, align 2
+@us1 = global i16 0, align 2
+@us2 = global i16 -1, align 2
+@ssi = global i16 0, align 2
+@ssj = global i16 0, align 2
+@i = global i32 0, align 4
+@j = global i32 0, align 4
+@.str = private unnamed_addr constant [4 x i8] c"%i\0A\00", align 1
+@.str1 = private unnamed_addr constant [7 x i8] c"%i %i\0A\00", align 1
+
+; Function Attrs: nounwind
+define void @_Z3b_iv()  {
+entry:
+; CHECK-LABEL:   .ent  _Z3b_iv
+  %0 = load i8* @b1, align 1
+  %tobool = trunc i8 %0 to i1
+  %frombool = zext i1 %tobool to i8
+  store i8 %frombool, i8* @b2, align 1
+  %1 = load i8* @b2, align 1
+  %tobool1 = trunc i8 %1 to i1
+  %conv = zext i1 %tobool1 to i32
+  store i32 %conv, i32* @i, align 4
+; CHECK:  lbu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; CHECK:  andi  $[[REG2:[0-9]+]], $[[REG1]], 1
+; CHECK:  sb  $[[REG2]], 0(${{[0-9]+}})
+
+
+
+  ret void
+; CHECK:   .end  _Z3b_iv
+}
+
+; Function Attrs: nounwind
+define void @_Z4uc_iv()  {
+entry:
+; CHECK-LABEL:  .ent  _Z4uc_iv
+
+  %0 = load i8* @uc1, align 1
+  %conv = zext i8 %0 to i32
+  store i32 %conv, i32* @i, align 4
+  %1 = load i8* @uc2, align 1
+  %conv1 = zext i8 %1 to i32
+; CHECK:   lbu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; CHECK:  andi  ${{[0-9]+}}, $[[REG1]], 255
+
+  store i32 %conv1, i32* @j, align 4
+  ret void
+; CHECK:  .end  _Z4uc_iv
+
+}
+
+; Function Attrs: nounwind
+define void @_Z4sc_iv()  {
+entry:
+; mips32r2-LABEL:  .ent  _Z4sc_iv
+; mips32-LABEL:  .ent  _Z4sc_iv
+
+  %0 = load i8* @sc1, align 1
+  %conv = sext i8 %0 to i32
+  store i32 %conv, i32* @i, align 4
+  %1 = load i8* @sc2, align 1
+  %conv1 = sext i8 %1 to i32
+  store i32 %conv1, i32* @j, align 4
+; mips32r2:  lbu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; mips32r2:  seb  ${{[0-9]+}}, $[[REG1]]
+; mips32:  lbu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; mips32:    sll  $[[REG2:[0-9]+]], $[[REG1]], 24
+; mips32:    sra  ${{[0-9]+}}, $[[REG2]], 24
+
+  ret void
+; CHECK:  .end  _Z4sc_iv
+}
+
+; Function Attrs: nounwind
+define void @_Z4us_iv()  {
+entry:
+; CHECK-LABEL:  .ent  _Z4us_iv
+  %0 = load i16* @us1, align 2
+  %conv = zext i16 %0 to i32
+  store i32 %conv, i32* @i, align 4
+  %1 = load i16* @us2, align 2
+  %conv1 = zext i16 %1 to i32
+  store i32 %conv1, i32* @j, align 4
+  ret void
+; CHECK:  lhu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; CHECK:  andi  ${{[0-9]+}}, $[[REG1]], 65535
+; CHECK:  .end  _Z4us_iv
+}
+
+; Function Attrs: nounwind
+define void @_Z4ss_iv()  {
+entry:
+; mips32r2-LABEL:  .ent  _Z4ss_iv
+; mips32=LABEL:  .ent  _Z4ss_iv
+
+  %0 = load i16* @ss1, align 2
+  %conv = sext i16 %0 to i32
+  store i32 %conv, i32* @i, align 4
+  %1 = load i16* @ss2, align 2
+  %conv1 = sext i16 %1 to i32
+  store i32 %conv1, i32* @j, align 4
+; mips32r2:  lhu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; mips32r2:  seh  ${{[0-9]+}}, $[[REG1]]
+; mips32:    lhu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; mips32:    sll  $[[REG2:[0-9]+]], $[[REG1]], 16
+; mips32:    sra  ${{[0-9]+}}, $[[REG2]], 16
+
+  ret void
+; CHECK:  .end  _Z4ss_iv
+}
+
+; Function Attrs: nounwind
+define void @_Z4b_ssv()  {
+entry:
+; CHECK-LABEL:  .ent  _Z4b_ssv
+  %0 = load i8* @b2, align 1
+  %tobool = trunc i8 %0 to i1
+  %conv = zext i1 %tobool to i16
+  store i16 %conv, i16* @ssi, align 2
+  ret void
+; CHECK:  lbu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; CHECK:  andi  ${{[0-9]+}}, $[[REG1]], 1
+; CHECK:  .end  _Z4b_ssv
+}
+
+; Function Attrs: nounwind
+define void @_Z5uc_ssv()  {
+entry:
+; CHECK-LABEL:  .ent  _Z5uc_ssv
+  %0 = load i8* @uc1, align 1
+  %conv = zext i8 %0 to i16
+  store i16 %conv, i16* @ssi, align 2
+  %1 = load i8* @uc2, align 1
+  %conv1 = zext i8 %1 to i16
+; CHECK:   lbu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; CHECK:  andi  ${{[0-9]+}}, $[[REG1]], 255
+
+  store i16 %conv1, i16* @ssj, align 2
+  ret void
+; CHECK:  .end  _Z5uc_ssv
+}
+
+; Function Attrs: nounwind
+define void @_Z5sc_ssv()  {
+entry:
+; mips32r2-LABEL:  .ent  _Z5sc_ssv
+; mips32-LABEL:  .ent  _Z5sc_ssv
+  %0 = load i8* @sc1, align 1
+  %conv = sext i8 %0 to i16
+  store i16 %conv, i16* @ssi, align 2
+  %1 = load i8* @sc2, align 1
+  %conv1 = sext i8 %1 to i16
+  store i16 %conv1, i16* @ssj, align 2
+; mips32r2:  lbu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; mips32r2:  seb  ${{[0-9]+}}, $[[REG1]]
+; mips32:  lbu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; mips32:    sll  $[[REG2:[0-9]+]], $[[REG1]], 24
+; mips32:    sra  ${{[0-9]+}}, $[[REG2]], 24
+
+  ret void
+; CHECK:  .end  _Z5sc_ssv
+}
+
diff --git a/test/CodeGen/Mips/Fast-ISel/loadstrconst.ll b/test/CodeGen/Mips/Fast-ISel/loadstrconst.ll
new file mode 100644
index 0000000..93cf4c1
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/loadstrconst.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+
+@.str = private unnamed_addr constant [6 x i8] c"hello\00", align 1
+@s = common global i8* null, align 4
+
+; Function Attrs: nounwind
+define void @foo() #0 {
+entry:
+  store i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), i8** @s, align 4
+  ret void
+; CHECK:        .ent    foo
+; CHECK:        lw      $[[REG1:[0-9]+]], %got($.str)(${{[0-9]+}})
+; CHECK:        addiu   ${{[0-9]+}}, $[[REG1]], %lo($.str)
+
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
diff --git a/test/CodeGen/Mips/Fast-ISel/nullvoid.ll b/test/CodeGen/Mips/Fast-ISel/nullvoid.ll
index eeaff87..c847561 100644
--- a/test/CodeGen/Mips/Fast-ISel/nullvoid.ll
+++ b/test/CodeGen/Mips/Fast-ISel/nullvoid.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
 ; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
 
 ; Function Attrs: nounwind
 define void @foo() {
diff --git a/test/CodeGen/Mips/Fast-ISel/shift.ll b/test/CodeGen/Mips/Fast-ISel/shift.ll
new file mode 100644
index 0000000..18fd5ac
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/shift.ll
@@ -0,0 +1,24 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -O1 -fast-isel=true -mips-fast-isel -filetype=obj %s -o - \
+; RUN:   | llvm-objdump -arch mipsel -mcpu=mips32r2 -d - | FileCheck %s
+
+; This test checks that encoding for srl is correct when fast-isel for mips32r2 is used.
+
+%struct.s = type { [4 x i8], i32 }
+
+define i32 @main() nounwind uwtable {
+entry:
+  %foo = alloca %struct.s, align 4
+  %0 = bitcast %struct.s* %foo to i32*
+  %bf.load = load i32* %0, align 4
+  %bf.lshr = lshr i32 %bf.load, 2
+  %cmp = icmp ne i32 %bf.lshr, 2
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  unreachable
+
+if.end:
+  ret i32 0
+}
+
+; CHECK: srl    ${{[0-9]+}}, ${{[0-9]+}}, {{[0-9]+}}
diff --git a/test/CodeGen/Mips/Fast-ISel/simplestore.ll b/test/CodeGen/Mips/Fast-ISel/simplestore.ll
index 5d52481..83e3f3f 100644
--- a/test/CodeGen/Mips/Fast-ISel/simplestore.ll
+++ b/test/CodeGen/Mips/Fast-ISel/simplestore.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
 ; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
 
 @abcd = external global i32
 
diff --git a/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll b/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll
index 6759c01..74723ae 100644
--- a/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll
+++ b/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll
@@ -1,5 +1,11 @@
 ; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s 
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
 ; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s -check-prefix=mips32r2 
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s -check-prefix=mips32
 
 @f = common global float 0.000000e+00, align 4
 @de = common global double 0.000000e+00, align 8
@@ -23,15 +29,25 @@ entry:
 define void @d1() #0 {
 entry:
   store double 1.234567e+00, double* @de, align 8
-; CHECK:  .ent  d1
-; CHECK:  lui  $[[REG1a:[0-9]+]], 16371
-; CHECK:  ori  $[[REG2a:[0-9]+]], $[[REG1a]], 49353
-; CHECK:  lui  $[[REG1b:[0-9]+]], 21403
-; CHECK:  ori  $[[REG2b:[0-9]+]], $[[REG1b]], 34951
-; CHECK:  mtc1  $[[REG2b]], $f[[REG3:[0-9]+]]
-; CHECK:  mthc1  $[[REG2a]], $f[[REG3]]
-; CHECK:  sdc1  $f[[REG3]], 0(${{[0-9]+}})
-; CHECK:  .end  d1
+; mip32r2:  .ent  d1
+; mips32r2:  lui  $[[REG1a:[0-9]+]], 16371
+; mips32r2:  ori  $[[REG2a:[0-9]+]], $[[REG1a]], 49353
+; mips32r2:  lui  $[[REG1b:[0-9]+]], 21403
+; mips32r2:  ori  $[[REG2b:[0-9]+]], $[[REG1b]], 34951
+; mips32r2:  mtc1  $[[REG2b]], $f[[REG3:[0-9]+]]
+; mips32r2:  mthc1  $[[REG2a]], $f[[REG3]]
+; mips32r2:  sdc1  $f[[REG3]], 0(${{[0-9]+}})
+; mips32r2:  .end  d1
+; mips32:  .ent  d1
+; mips32:  lui  $[[REG1a:[0-9]+]], 16371
+; mips32:  ori  $[[REG2a:[0-9]+]], $[[REG1a]], 49353
+; mips32:  lui  $[[REG1b:[0-9]+]], 21403
+; mips32:  ori  $[[REG2b:[0-9]+]], $[[REG1b]], 34951
+; mips32:  mtc1  $[[REG2b]], $f[[REG3:[0-9]+]]
+; mips32:  mtc1  $[[REG2a]], $f{{[0-9]+}}
+; mips32:  sdc1  $f[[REG3]], 0(${{[0-9]+}})
+; mips32:  .end  d1
+
   ret void
 }
 
diff --git a/test/CodeGen/Mips/Fast-ISel/simplestorei.ll b/test/CodeGen/Mips/Fast-ISel/simplestorei.ll
index 7d2c8e7..128e1de 100644
--- a/test/CodeGen/Mips/Fast-ISel/simplestorei.ll
+++ b/test/CodeGen/Mips/Fast-ISel/simplestorei.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
 ; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
 
 @ijk = external global i32
 
diff --git a/test/CodeGen/Mips/abicalls.ll b/test/CodeGen/Mips/abicalls.ll
index 6fa33aa..7edc3e2 100644
--- a/test/CodeGen/Mips/abicalls.ll
+++ b/test/CodeGen/Mips/abicalls.ll
@@ -1,16 +1,11 @@
-; 
-; When the assembler is ready a .s file for it will
-; be created.
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -relocation-model=static %s -o - | FileCheck -check-prefix=ABICALLS -check-prefix=STATIC %s
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | FileCheck -check-prefix=ABICALLS -check-prefix=PIC %s
+; RUN: llc -filetype=asm -mtriple mips64el-unknown-linux -mcpu=mips4 -relocation-model=static %s -o - | FileCheck -check-prefix=ABICALLS -check-prefix=PIC %s
+; RUN: llc -filetype=asm -mtriple mips64el-unknown-linux -mcpu=mips64 -relocation-model=static %s -o - | FileCheck -check-prefix=ABICALLS -check-prefix=PIC %s
 
-; Note that EF_MIPS_CPIC is set by -mabicalls which is the default on Linux
-; TODO need to support -mno-abicalls
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -mattr noabicalls -relocation-model=static %s -o - | FileCheck -implicit-check-not='.abicalls' -implicit-check-not='pic0' %s
 
-; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-STATIC %s
-; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | FileCheck -check-prefix=CHECK-PIC %s
-; RUN: llc -filetype=asm -mtriple mips64el-unknown-linux -mcpu=mips4 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-PIC %s
-; RUN: llc -filetype=asm -mtriple mips64el-unknown-linux -mcpu=mips64 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-PIC %s
+; ABICALLS: .abicalls
 
-; CHECK-STATIC: .abicalls
-; CHECK-STATIC-NEXT: pic0
-; CHECK-PIC: .abicalls
-; CHECK-PIC-NOT: pic0
+; STATIC: pic0
+; PIC-NOT: pic0
diff --git a/test/CodeGen/Mips/abiflags-xx.ll b/test/CodeGen/Mips/abiflags-xx.ll
index b8aa071..c461012 100644
--- a/test/CodeGen/Mips/abiflags-xx.ll
+++ b/test/CodeGen/Mips/abiflags-xx.ll
@@ -1,5 +1,4 @@
 ; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -mattr=fpxx %s -o - | FileCheck %s
-; XFAIL: *
 
 ; CHECK: .nan    legacy
 ; CHECK: .module fp=xx
diff --git a/test/CodeGen/Mips/abiflags32.ll b/test/CodeGen/Mips/abiflags32.ll
index 093964f..e32d4a5 100644
--- a/test/CodeGen/Mips/abiflags32.ll
+++ b/test/CodeGen/Mips/abiflags32.ll
@@ -3,10 +3,15 @@
 ; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips64 -mattr=-n64,n32 %s -o - | FileCheck  -check-prefix=CHECK-64n %s
 
 ; CHECK: .nan    legacy
-; CHECK: .module fp=32
+; We don't emit '.module fp=32' for compatibility with binutils 2.24 which
+; doesn't accept .module.
+; CHECK-NOT: .module fp=32
 
 ; CHECK-64: .nan    legacy
+; We do emit '.module fp=64' though since it contradicts the default value.
 ; CHECK-64: .module fp=64
 
 ; CHECK-64n: .nan    legacy
-; CHECK-64n: .module fp=64
+; We don't emit '.module fp=64' for compatibility with binutils 2.24 which
+; doesn't accept .module.
+; CHECK-64n-NOT: .module fp=64
diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll
index 066d42c..78fd829 100644
--- a/test/CodeGen/Mips/atomic.ll
+++ b/test/CodeGen/Mips/atomic.ll
@@ -8,11 +8,11 @@
 
 ; Keep one big-endian check so that we don't reduce testing, but don't add more
 ; since endianness doesn't affect the body of the atomic operations.
-; RUN: llc -march=mips   --disable-machine-licm -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=CHECK-EB
+; RUN: llc -march=mips   --disable-machine-licm -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=NO-SEB-SEH -check-prefix=CHECK-EB
 
 @x = common global i32 0, align 4
 
-define i32 @AtomicLoadAdd32(i32 %incr) nounwind {
+define i32 @AtomicLoadAdd32(i32 signext %incr) nounwind {
 entry:
   %0 = atomicrmw add i32* @x, i32 %incr monotonic
   ret i32 %0
@@ -29,7 +29,7 @@ entry:
 ; ALL:           beqz    $[[R2]], $[[BB0]]
 }
 
-define i32 @AtomicLoadNand32(i32 %incr) nounwind {
+define i32 @AtomicLoadNand32(i32 signext %incr) nounwind {
 entry:
   %0 = atomicrmw nand i32* @x, i32 %incr monotonic
   ret i32 %0
@@ -47,7 +47,7 @@ entry:
 ; ALL:           beqz    $[[R2]], $[[BB0]]
 }
 
-define i32 @AtomicSwap32(i32 %newval) nounwind {
+define i32 @AtomicSwap32(i32 signext %newval) nounwind {
 entry:
   %newval.addr = alloca i32, align 4
   store i32 %newval, i32* %newval.addr, align 4
@@ -66,7 +66,7 @@ entry:
 ; ALL:           beqz    $[[R2]], $[[BB0]]
 }
 
-define i32 @AtomicCmpSwap32(i32 %oldval, i32 %newval) nounwind {
+define i32 @AtomicCmpSwap32(i32 signext %oldval, i32 signext %newval) nounwind {
 entry:
   %newval.addr = alloca i32, align 4
   store i32 %newval, i32* %newval.addr, align 4
@@ -246,6 +246,7 @@ entry:
 ; NO-SEB-SEH:    sra     $2, $[[R17]], 24
 
 ; HAS-SEB-SEH:   seb     $2, $[[R16]]
+
 }
 
 define signext i8 @AtomicCmpSwap8(i8 signext %oldval, i8 signext %newval) nounwind {
@@ -292,6 +293,49 @@ entry:
 ; HAS-SEB-SEH:   seb     $2, $[[R17]]
 }
 
+define i1 @AtomicCmpSwapRes8(i8* %ptr, i8 signext %oldval, i8 signext %newval) nounwind {
+entry:
+  %0 = cmpxchg i8* %ptr, i8 %oldval, i8 %newval monotonic monotonic
+  %1 = extractvalue { i8, i1 } %0, 1
+  ret i1 %1
+; ALL-LABEL: AtomicCmpSwapRes8
+
+; ALL:           addiu   $[[R1:[0-9]+]], $zero, -4
+; ALL:           and     $[[R2:[0-9]+]], $4, $[[R1]]
+; ALL:           andi    $[[R3:[0-9]+]], $4, 3
+; CHECK-EL:      sll     $[[R5:[0-9]+]], $[[R3]], 3
+; CHECK-EB:      xori    $[[R4:[0-9]+]], $[[R3]], 3
+; CHECK-EB:      sll     $[[R5:[0-9]+]], $[[R4]], 3
+; ALL:           ori     $[[R6:[0-9]+]], $zero, 255
+; ALL:           sllv    $[[R7:[0-9]+]], $[[R6]], $[[R5]]
+; ALL:           nor     $[[R8:[0-9]+]], $zero, $[[R7]]
+; ALL:           andi    $[[R9:[0-9]+]], $5, 255
+; ALL:           sllv    $[[R10:[0-9]+]], $[[R9]], $[[R5]]
+; ALL:           andi    $[[R11:[0-9]+]], $6, 255
+; ALL:           sllv    $[[R12:[0-9]+]], $[[R11]], $[[R5]]
+
+; ALL:       $[[BB0:[A-Z_0-9]+]]:
+; ALL:           ll      $[[R13:[0-9]+]], 0($[[R2]])
+; ALL:           and     $[[R14:[0-9]+]], $[[R13]], $[[R7]]
+; ALL:           bne     $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
+
+; ALL:           and     $[[R15:[0-9]+]], $[[R13]], $[[R8]]
+; ALL:           or      $[[R16:[0-9]+]], $[[R15]], $[[R12]]
+; ALL:           sc      $[[R16]], 0($[[R2]])
+; ALL:           beqz    $[[R16]], $[[BB0]]
+
+; ALL:       $[[BB1]]:
+; ALL:           srlv    $[[R17:[0-9]+]], $[[R14]], $[[R5]]
+
+; NO-SEB-SEH:    sll     $[[R18:[0-9]+]], $[[R17]], 24
+; NO-SEB-SEH:    sra     $[[R19:[0-9]+]], $[[R18]], 24
+
+; HAS-SEB-SEH:   seb     $[[R19:[0-9]+]], $[[R17]]
+
+; ALL:           xor     $[[R20:[0-9]+]], $[[R19]], $5
+; ALL:           sltiu   $2, $[[R20]], 1
+}
+
 ; Check one i16 so that we cover the seh sign extend
 @z = common global i16 0, align 1
 
@@ -337,7 +381,7 @@ entry:
 
 @countsint = common global i32 0, align 4
 
-define i32 @CheckSync(i32 %v) nounwind noinline {
+define i32 @CheckSync(i32 signext %v) nounwind noinline {
 entry:
   %0 = atomicrmw add i32* @countsint, i32 %v seq_cst
   ret i32 %0 
@@ -371,7 +415,7 @@ entry:
 
 ; Check that MIPS32R6 has the correct offset range.
 ; FIXME: At the moment, we don't seem to do addr+offset for any atomic load/store.
-define i32 @AtomicLoadAdd32_OffGt9Bit(i32 %incr) nounwind {
+define i32 @AtomicLoadAdd32_OffGt9Bit(i32 signext %incr) nounwind {
 entry:
   %0 = atomicrmw add i32* getelementptr(i32* @x, i32 256), i32 %incr monotonic
   ret i32 %0
diff --git a/test/CodeGen/Mips/bswap.ll b/test/CodeGen/Mips/bswap.ll
index 812eef1..f182e65 100644
--- a/test/CodeGen/Mips/bswap.ll
+++ b/test/CodeGen/Mips/bswap.ll
@@ -2,7 +2,7 @@
 ; RUN: llc  < %s -march=mips64el -mcpu=mips64r2 | FileCheck %s -check-prefix=MIPS64
 ; RUN: llc  < %s -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32r2 -mattr=+mips16 | FileCheck %s -check-prefix=MIPS16
 
-define i32 @bswap32(i32 %x) nounwind readnone {
+define i32 @bswap32(i32 signext %x) nounwind readnone {
 entry:
 ; MIPS32-LABEL: bswap32:
 ; MIPS32: wsbh $[[R0:[0-9]+]]
@@ -29,7 +29,7 @@ entry:
   ret i32 %or.3
 }
 
-define i64 @bswap64(i64 %x) nounwind readnone {
+define i64 @bswap64(i64 signext %x) nounwind readnone {
 entry:
 ; MIPS32-LABEL: bswap64:
 ; MIPS32: wsbh $[[R0:[0-9]+]]
@@ -72,24 +72,24 @@ entry:
 define <4 x i32> @bswapv4i32(<4 x i32> %x) nounwind readnone {
 entry:
 ; MIPS32-LABEL: bswapv4i32:
-; MIPS32: wsbh $[[R0:[0-9]+]]
-; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
-; MIPS32: wsbh $[[R0:[0-9]+]]
-; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
-; MIPS32: wsbh $[[R0:[0-9]+]]
-; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
-; MIPS32: wsbh $[[R0:[0-9]+]]
-; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS32-DAG: wsbh $[[R0:[0-9]+]]
+; MIPS32-DAG: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS32-DAG: wsbh $[[R0:[0-9]+]]
+; MIPS32-DAG: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS32-DAG: wsbh $[[R0:[0-9]+]]
+; MIPS32-DAG: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS32-DAG: wsbh $[[R0:[0-9]+]]
+; MIPS32-DAG: rotr ${{[0-9]+}}, $[[R0]], 16
 
 ; MIPS64-LABEL: bswapv4i32:
-; MIPS64: wsbh $[[R0:[0-9]+]]
-; MIPS64: rotr ${{[0-9]+}}, $[[R0]], 16
-; MIPS64: wsbh $[[R0:[0-9]+]]
-; MIPS64: rotr ${{[0-9]+}}, $[[R0]], 16
-; MIPS64: wsbh $[[R0:[0-9]+]]
-; MIPS64: rotr ${{[0-9]+}}, $[[R0]], 16
-; MIPS64: wsbh $[[R0:[0-9]+]]
-; MIPS64: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS64-DAG: wsbh $[[R0:[0-9]+]]
+; MIPS64-DAG: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS64-DAG: wsbh $[[R0:[0-9]+]]
+; MIPS64-DAG: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS64-DAG: wsbh $[[R0:[0-9]+]]
+; MIPS64-DAG: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS64-DAG: wsbh $[[R0:[0-9]+]]
+; MIPS64-DAG: rotr ${{[0-9]+}}, $[[R0]], 16
 
 ; Don't bother with a MIPS16 version. It's just bswap32 repeated four times and
 ; would be very long
diff --git a/test/CodeGen/Mips/buildpairextractelementf64.ll b/test/CodeGen/Mips/buildpairextractelementf64.ll
index 88d1d07..7682a98 100644
--- a/test/CodeGen/Mips/buildpairextractelementf64.ll
+++ b/test/CodeGen/Mips/buildpairextractelementf64.ll
@@ -1,15 +1,19 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=FP32 -check-prefix=CHECK
-; RUN: llc -march=mips  < %s | FileCheck %s -check-prefix=FP32 -check-prefix=CHECK
-; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64 -check-prefix=CHECK
-; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64 -check-prefix=CHECK
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=NO-MFHC1 -check-prefix=ALL
+; RUN: llc -march=mips  < %s | FileCheck %s -check-prefix=NO-MFHC1 -check-prefix=ALL
+; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=HAS-MFHC1 -check-prefix=ALL
+; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=HAS-MFHC1 -check-prefix=ALL
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=HAS-MFHC1 -check-prefix=ALL
+; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=HAS-MFHC1 -check-prefix=ALL
 
 @a = external global i32
 
-; CHECK-LABEL: f:
-; FP32: mtc1
-; FP32: mtc1
-; FP64-DAG: mtc1
-; FP64-DAG: mthc1
+; ALL-LABEL: f:
+
+; NO-MFHC1: mtc1
+; NO-MFHC1: mtc1
+
+; HAS-MFHC1-DAG: mtc1
+; HAS-MFHC1-DAG: mthc1
 
 define double @f(i32 %a1, double %d) nounwind {
 entry:
@@ -18,11 +22,13 @@ entry:
   ret double %add
 }
 
-; CHECK-LABEL: f3:
-; FP32: mfc1
-; FP32: mfc1
-; FP64-DAG: mfc1
-; FP64-DAG: mfhc1
+; ALL-LABEL: f3:
+
+; NO-MFHC1: mfc1
+; NO-MFHC1: mfc1
+
+; HAS-MFHC1-DAG: mfc1
+; HAS-MFHC1-DAG: mfhc1
 
 define void @f3(double %d, i32 %a1) nounwind {
 entry:
diff --git a/test/CodeGen/Mips/cconv/arguments-float.ll b/test/CodeGen/Mips/cconv/arguments-float.ll
index e2119ec..14a3baa 100644
--- a/test/CodeGen/Mips/cconv/arguments-float.ll
+++ b/test/CodeGen/Mips/cconv/arguments-float.ll
@@ -69,26 +69,26 @@ entry:
 ; O32-DAG:           sw [[R4]], 28([[R2]])
 ; NEW-DAG:           sd $6, 24([[R2]])
 
-; O32-DAG:           lw [[R3:\$[0-9]+]], 32($sp)
-; O32-DAG:           lw [[R4:\$[0-9]+]], 36($sp)
+; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 32($sp)
+; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 36($sp)
 ; O32-DAG:           sw [[R3]], 32([[R2]])
 ; O32-DAG:           sw [[R4]], 36([[R2]])
 ; NEW-DAG:           sd $7, 32([[R2]])
 
-; O32-DAG:           lw [[R3:\$[0-9]+]], 40($sp)
-; O32-DAG:           lw [[R4:\$[0-9]+]], 44($sp)
+; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 40($sp)
+; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 44($sp)
 ; O32-DAG:           sw [[R3]], 40([[R2]])
 ; O32-DAG:           sw [[R4]], 44([[R2]])
 ; NEW-DAG:           sd $8, 40([[R2]])
 
-; O32-DAG:           lw [[R3:\$[0-9]+]], 48($sp)
-; O32-DAG:           lw [[R4:\$[0-9]+]], 52($sp)
+; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 48($sp)
+; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 52($sp)
 ; O32-DAG:           sw [[R3]], 48([[R2]])
 ; O32-DAG:           sw [[R4]], 52([[R2]])
 ; NEW-DAG:           sd $9, 48([[R2]])
 
-; O32-DAG:           lw [[R3:\$[0-9]+]], 56($sp)
-; O32-DAG:           lw [[R4:\$[0-9]+]], 60($sp)
+; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 56($sp)
+; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 60($sp)
 ; O32-DAG:           sw [[R3]], 56([[R2]])
 ; O32-DAG:           sw [[R4]], 60([[R2]])
 ; NEW-DAG:           sd $10, 56([[R2]])
@@ -135,8 +135,8 @@ entry:
 ; SYM64-DAG:           ld [[R2:\$[0-9]]], %got_disp(floats)(
 
 ; The first four arguments are the same in O32/N32/N64.
-; The first argument isn't floating point so floating point registers are not
-; used.
+; The first argument is floating point but soft-float is enabled so floating
+; point registers are not used.
 ; MD00305 and GCC disagree on this one. MD00305 says that floats are treated
 ; as 8-byte aligned and occupy two slots on O32. GCC is treating them as 4-byte
 ; aligned and occupying one slot. We'll use GCC's definition.
@@ -195,7 +195,7 @@ entry:
 ; O32-DAG:           sw $7, 12([[R2]])
 ; NEW-DAG:           sd $5, 8([[R2]])
 
-define void @float_arg2(i8 %a, float %b) nounwind {
+define void @float_arg2(i8 signext %a, float %b) nounwind {
 entry:
         %0 = getelementptr [11 x i8]* @bytes, i32 0, i32 1
         store volatile i8 %a, i8* %0
diff --git a/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll b/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll
index aadf7d1..70ccf14 100644
--- a/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll
+++ b/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll
@@ -4,11 +4,11 @@
 ; RUN-TODO: llc -march=mips64 -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
 ; RUN-TODO: llc -march=mips64el -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
 
-; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=N32 --check-prefix=NEW %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=N32 --check-prefix=NEW %s
+; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=N32 --check-prefix=NEW --check-prefix=NEWBE %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=N32 --check-prefix=NEW --check-prefix=NEWLE %s
 
-; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=N64 --check-prefix=NEW %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=N64 --check-prefix=NEW %s
+; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=N64 --check-prefix=NEW --check-prefix=NEWBE %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=N64 --check-prefix=NEW --check-prefix=NEWLE %s
 
 ; Test the effect of varargs on floating point types in the non-variable part
 ; of the argument list as specified by section 2 of the MIPSpro N32 Handbook.
@@ -34,6 +34,7 @@ entry:
         %b = va_arg i8** %ap, double
         %1 = getelementptr [11 x double]* @doubles, i32 0, i32 2
         store volatile double %b, double* %1
+        call void @llvm.va_end(i8* %ap2)
         ret void
 }
 
@@ -98,6 +99,7 @@ entry:
         %b = va_arg i8** %ap, float
         %1 = getelementptr [11 x float]* @floats, i32 0, i32 2
         store volatile float %b, float* %1
+        call void @llvm.va_end(i8* %ap2)
         ret void
 }
 
@@ -140,16 +142,18 @@ entry:
 ; Increment the pointer then get the varargs arg
 ; LLVM will rebind the load to the stack pointer instead of the varargs pointer
 ; during lowering. This is fine and doesn't change the behaviour.
-; N32/N64 is using ori instead of addiu/daddiu but (although odd) this is fine
-; since the stack is always aligned.
+; Also, in big-endian mode the offset must be increased by 4 to retrieve the
+; correct half of the argument slot.
+;
 ; O32-DAG:           addiu [[VAPTR]], [[VAPTR]], 4
 ; O32-DAG:           sw [[VAPTR]], 4($sp)
-; N32-DAG:           ori [[VAPTR]], [[VAPTR]], 4
+; N32-DAG:           addiu [[VAPTR]], [[VAPTR]], 8
 ; N32-DAG:           sw [[VAPTR]], 4($sp)
-; N64-DAG:           ori [[VAPTR]], [[VAPTR]], 4
+; N64-DAG:           daddiu [[VAPTR]], [[VAPTR]], 8
 ; N64-DAG:           sd [[VAPTR]], 0($sp)
 ; O32-DAG:           lwc1 [[FTMP1:\$f[0-9]+]], 12($sp)
-; NEW-DAG:           lwc1 [[FTMP1:\$f[0-9]+]], 8($sp)
+; NEWLE-DAG:         lwc1 [[FTMP1:\$f[0-9]+]], 8($sp)
+; NEWBE-DAG:         lwc1 [[FTMP1:\$f[0-9]+]], 12($sp)
 ; ALL-DAG:           swc1 [[FTMP1]], 8([[R2]])
 
 declare void @llvm.va_start(i8*)
diff --git a/test/CodeGen/Mips/cconv/arguments-varargs.ll b/test/CodeGen/Mips/cconv/arguments-varargs.ll
new file mode 100644
index 0000000..adacda5
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/arguments-varargs.ll
@@ -0,0 +1,1104 @@
+; RUN: llc -mtriple=mips-linux -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 --check-prefix=O32-BE %s
+; RUN: llc -mtriple=mipsel-linux -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 --check-prefix=O32-LE %s
+
+; RUN-TODO: llc -march=mips64 -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+
+; RUN: llc -mtriple=mips64-linux -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N32 --check-prefix=NEW-BE %s
+; RUN: llc -mtriple=mips64el-linux -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N32 --check-prefix=NEW-LE %s
+
+; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N64 --check-prefix=NEW-BE %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N64 --check-prefix=NEW-LE %s
+
+@hwords = global [3 x i16] zeroinitializer, align 1
+@words  = global [3 x i32] zeroinitializer, align 1
+@dwords = global [3 x i64] zeroinitializer, align 1
+
+define void @fn_i16_dotdotdot_i16(i16 %a, ...) {
+entry:
+; ALL-LABEL: fn_i16_dotdotdot_i16:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu  [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+; O32-DAG:       sw $5, 12([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 12 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
+; fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 12
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]]
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion.
+; This has used the stack pointer directly rather than the [[VA]] we just set
+; up.
+; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
+; order.
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-BE-DAG:    lw [[ARG1:\$[0-9]+]], 4([[VA]])
+
+; Copy the arg to the global
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
+
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(hwords)(
+
+; ALL-DAG:       sh [[ARG1]], 2([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-BE-DAG:    lw [[ARG2:\$[0-9]+]], 4([[VA2]])
+
+; Copy the arg to the global
+; ALL-DAG:       sh [[ARG2]], 4([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i16
+  %e1 = getelementptr [3 x i16]* @hwords, i32 0, i32 1
+  store volatile i16 %arg1, i16* %e1, align 2
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i16
+  %e2 = getelementptr [3 x i16]* @hwords, i32 0, i32 2
+  store volatile i16 %arg2, i16* %e2, align 2
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+define void @fn_i16_dotdotdot_i32(i16 %a, ...) {
+entry:
+; ALL-LABEL: fn_i16_dotdotdot_i32:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu  [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+; O32-DAG:       sw $5, 12([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 12 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
+; fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 12
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]]
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion.
+; This has used the stack pointer directly rather than the [[VA]] we just set
+; up.
+; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
+; order.
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-BE-DAG:    lw [[ARG1:\$[0-9]+]], 4([[VA]])
+
+; Copy the arg to the global
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
+
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(words)(
+
+; ALL-DAG:       sw [[ARG1]], 4([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-BE-DAG:    lw [[ARG2:\$[0-9]+]], 4([[VA2]])
+
+; Copy the arg to the global
+; ALL-DAG:       sw [[ARG2]], 8([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i32
+  %e1 = getelementptr [3 x i32]* @words, i32 0, i32 1
+  store volatile i32 %arg1, i32* %e1, align 4
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i32
+  %e2 = getelementptr [3 x i32]* @words, i32 0, i32 2
+  store volatile i32 %arg2, i32* %e2, align 4
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+define void @fn_i16_dotdotdot_i64(i16 %a, ...) {
+entry:
+; ALL-LABEL: fn_i16_dotdotdot_i64:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu  [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+; O32-DAG:       sw $5, 12([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 12 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
+; fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 12
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]] (and realign pointer for O32)
+; O32:           lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
+; O32-DAG:       addiu [[VA_TMP1:\$[0-9]+]], $zero, -8
+; O32-DAG:       and   [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
+; O32-DAG:       ori   [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion and copy it to the global.
+; This has used the stack pointer directly rather than the [[VA]] we just set
+; up.
+; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
+; order.
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG1]], 8([[GV]])
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG1]], 12([[GV]])
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(dwords)(
+; NEW-DAG:       ld [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-DAG:       sd [[ARG1]], 8([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; FIXME: We're still aligned from the last one but CodeGen doesn't spot that.
+; O32:           lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
+; O32-DAG:       and   [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
+; O32-DAG:       ori   [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion and copy it to the global.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG2]], 16([[GV]])
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG2]], 20([[GV]])
+
+; NEW-DAG:       ld [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-DAG:       sd [[ARG2]], 16([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i64
+  %e1 = getelementptr [3 x i64]* @dwords, i32 0, i32 1
+  store volatile i64 %arg1, i64* %e1, align 8
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i64
+  %e2 = getelementptr [3 x i64]* @dwords, i32 0, i32 2
+  store volatile i64 %arg2, i64* %e2, align 8
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+define void @fn_i32_dotdotdot_i16(i32 %a, ...) {
+entry:
+; ALL-LABEL: fn_i32_dotdotdot_i16:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+; O32-DAG:       sw $5, 12([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 12 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
+; fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 12
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]]
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion.
+; This has used the stack pointer directly rather than the [[VA]] we just set
+; up.
+; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
+; order.
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-BE-DAG:    lw [[ARG1:\$[0-9]+]], 4([[VA]])
+
+; Copy the arg to the global
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
+
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(hwords)(
+
+; ALL-DAG:       sh [[ARG1]], 2([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-BE-DAG:    lw [[ARG2:\$[0-9]+]], 4([[VA2]])
+
+; Copy the arg to the global
+; ALL-DAG:       sh [[ARG2]], 4([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i16
+  %e1 = getelementptr [3 x i16]* @hwords, i32 0, i32 1
+  store volatile i16 %arg1, i16* %e1, align 2
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i16
+  %e2 = getelementptr [3 x i16]* @hwords, i32 0, i32 2
+  store volatile i16 %arg2, i16* %e2, align 2
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+define void @fn_i32_dotdotdot_i32(i32 %a, ...) {
+entry:
+; ALL-LABEL: fn_i32_dotdotdot_i32:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu  [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+; O32-DAG:       sw $5, 12([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 12 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
+; fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 12
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]]
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion.
+; This has used the stack pointer directly rather than the [[VA]] we just set
+; up.
+; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
+; order.
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-BE-DAG:    lw [[ARG1:\$[0-9]+]], 4([[VA]])
+
+; Copy the arg to the global
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
+
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(words)(
+
+; ALL-DAG:       sw [[ARG1]], 4([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-BE-DAG:    lw [[ARG2:\$[0-9]+]], 4([[VA2]])
+
+; Copy the arg to the global
+; ALL-DAG:       sw [[ARG2]], 8([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i32
+  %e1 = getelementptr [3 x i32]* @words, i32 0, i32 1
+  store volatile i32 %arg1, i32* %e1, align 4
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i32
+  %e2 = getelementptr [3 x i32]* @words, i32 0, i32 2
+  store volatile i32 %arg2, i32* %e2, align 4
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+define void @fn_i32_dotdotdot_i64(i32 %a, ...) {
+entry:
+; ALL-LABEL: fn_i32_dotdotdot_i64:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu  [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+; O32-DAG:       sw $5, 12([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 12 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
+; fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 12
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]] (and realign pointer for O32)
+; O32:           lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
+; O32-DAG:       addiu [[VA_TMP1:\$[0-9]+]], $zero, -8
+; O32-DAG:       and   [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
+; O32-DAG:       ori   [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion and copy it to the global.
+; This has used the stack pointer directly rather than the [[VA]] we just set
+; up.
+; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
+; order.
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG1]], 8([[GV]])
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG1]], 12([[GV]])
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(dwords)(
+; NEW-DAG:       ld [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-DAG:       sd [[ARG1]], 8([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; FIXME: We're still aligned from the last one but CodeGen doesn't spot that.
+; O32:           lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
+; O32-DAG:       and   [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
+; O32-DAG:       ori   [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion and copy it to the global.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG2]], 16([[GV]])
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG2]], 20([[GV]])
+
+; NEW-DAG:       ld [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-DAG:       sd [[ARG2]], 16([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i64
+  %e1 = getelementptr [3 x i64]* @dwords, i32 0, i32 1
+  store volatile i64 %arg1, i64* %e1, align 8
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i64
+  %e2 = getelementptr [3 x i64]* @dwords, i32 0, i32 2
+  store volatile i64 %arg2, i64* %e2, align 8
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+define void @fn_i64_dotdotdot_i16(i64 %a, ...) {
+entry:
+; ALL-LABEL: fn_i64_dotdotdot_i16:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 16 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the two 4 byte slots for the
+; first fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 16
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]]
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion.
+; This has used the stack pointer directly rather than the [[VA]] we just set
+; up.
+; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
+; order.
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-BE-DAG:    lw [[ARG1:\$[0-9]+]], 4([[VA]])
+
+; Copy the arg to the global
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
+
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(hwords)(
+
+; ALL-DAG:       sh [[ARG1]], 2([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-BE-DAG:    lw [[ARG2:\$[0-9]+]], 4([[VA2]])
+
+; Copy the arg to the global
+; ALL-DAG:       sh [[ARG2]], 4([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i16
+  %e1 = getelementptr [3 x i16]* @hwords, i32 0, i32 1
+  store volatile i16 %arg1, i16* %e1, align 2
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i16
+  %e2 = getelementptr [3 x i16]* @hwords, i32 0, i32 2
+  store volatile i16 %arg2, i16* %e2, align 2
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+define void @fn_i64_dotdotdot_i32(i64 %a, ...) {
+entry:
+; ALL-LABEL: fn_i64_dotdotdot_i32:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu  [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 16 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the two 4 byte slots for the
+; first fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 16
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]]
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion.
+; This has used the stack pointer directly rather than the [[VA]] we just set
+; up.
+; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
+; order.
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-BE-DAG:    lw [[ARG1:\$[0-9]+]], 4([[VA]])
+
+; Copy the arg to the global
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
+
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(words)(
+
+; ALL-DAG:       sw [[ARG1]], 4([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-BE-DAG:    lw [[ARG2:\$[0-9]+]], 4([[VA2]])
+
+; Copy the arg to the global
+; ALL-DAG:       sw [[ARG2]], 8([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i32
+  %e1 = getelementptr [3 x i32]* @words, i32 0, i32 1
+  store volatile i32 %arg1, i32* %e1, align 4
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i32
+  %e2 = getelementptr [3 x i32]* @words, i32 0, i32 2
+  store volatile i32 %arg2, i32* %e2, align 4
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+define void @fn_i64_dotdotdot_i64(i64 %a, ...) {
+entry:
+; ALL-LABEL: fn_i64_dotdotdot_i64:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu  [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 16 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the two 4 byte slots for the
+; first fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 16
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]] (and realign pointer for O32)
+; O32:           lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
+; O32-DAG:       addiu [[VA_TMP1:\$[0-9]+]], $zero, -8
+; O32-DAG:       and   [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
+; O32-DAG:       ori   [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion and copy it to the global.
+; This has used the stack pointer directly rather than the [[VA]] we just set
+; up.
+; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
+; order.
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG1]], 8([[GV]])
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG1]], 12([[GV]])
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(dwords)(
+; NEW-DAG:       ld [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-DAG:       sd [[ARG1]], 8([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; FIXME: We're still aligned from the last one but CodeGen doesn't spot that.
+; O32:           lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
+; O32-DAG:       and   [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
+; O32-DAG:       ori   [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion and copy it to the global.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG2]], 16([[GV]])
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG2]], 20([[GV]])
+
+; NEW-DAG:       ld [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-DAG:       sd [[ARG2]], 16([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i64
+  %e1 = getelementptr [3 x i64]* @dwords, i32 0, i32 1
+  store volatile i64 %arg1, i64* %e1, align 8
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i64
+  %e2 = getelementptr [3 x i64]* @dwords, i32 0, i32 2
+  store volatile i64 %arg2, i64* %e2, align 8
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+declare void @llvm.va_start(i8*)
+declare void @llvm.va_end(i8*)
diff --git a/test/CodeGen/Mips/cconv/arguments.ll b/test/CodeGen/Mips/cconv/arguments.ll
index 8fe29f3..43da604 100644
--- a/test/CodeGen/Mips/cconv/arguments.ll
+++ b/test/CodeGen/Mips/cconv/arguments.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=mips -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32BE %s
-; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32LE %s
+; RUN: llc -march=mips -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
 
 ; RUN-TODO: llc -march=mips64 -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
 ; RUN-TODO: llc -march=mips64el -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
@@ -23,8 +23,10 @@
 @floats = global [11 x float] zeroinitializer
 @doubles = global [11 x double] zeroinitializer
 
-define void @align_to_arg_slots(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g,
-                                i8 %h, i8 %i, i8 %j) nounwind {
+define void @align_to_arg_slots(i8 signext %a, i8 signext %b, i8 signext %c,
+                                i8 signext %d, i8 signext %e, i8 signext %f,
+                                i8 signext %g, i8 signext %h, i8 signext %i,
+                                i8 signext %j) nounwind {
 entry:
         %0 = getelementptr [11 x i8]* @bytes, i32 0, i32 1
         store volatile i8 %a, i8* %0
@@ -53,7 +55,7 @@ entry:
 ; We won't test the way the global address is calculated in this test. This is
 ; just to get the register number for the other checks.
 ; SYM32-DAG:           addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes)
-; SYM64-DAG:           ld [[R1:\$[0-9]]], %got_disp(bytes)(
+; SYM64-DAG:           ld [[R1:\$[0-9]+]], %got_disp(bytes)(
 
 ; The first four arguments are the same in O32/N32/N64
 ; ALL-DAG:           sb $4, 1([[R1]])
@@ -82,15 +84,16 @@ entry:
 ; increase by 4 for O32 and 8 for N32/N64.
 ; O32-DAG:           lw [[R3:\$[0-9]+]], 32($sp)
 ; O32-DAG:           sb [[R3]], 9([[R1]])
-; NEW-DAG:           lw [[R3:\$[0-9]+]], 0($sp)
+; NEW-DAG:           ld [[R3:\$[0-9]+]], 0($sp)
 ; NEW-DAG:           sb [[R3]], 9([[R1]])
 ; O32-DAG:           lw [[R3:\$[0-9]+]], 36($sp)
 ; O32-DAG:           sb [[R3]], 10([[R1]])
-; NEW-DAG:           lw [[R3:\$[0-9]+]], 8($sp)
+; NEW-DAG:           ld [[R3:\$[0-9]+]], 8($sp)
 ; NEW-DAG:           sb [[R3]], 10([[R1]])
 
-define void @slot_skipping(i8 %a, i64 %b, i8 %c, i8 %d,
-                           i8 %e, i8 %f, i8 %g, i64 %i, i8 %j) nounwind {
+define void @slot_skipping(i8 signext %a, i64 signext %b, i8 signext %c,
+                           i8 signext %d, i8 signext %e, i8 signext %f,
+                           i8 signext %g, i64 signext %i, i8 signext %j) nounwind {
 entry:
         %0 = getelementptr [11 x i8]* @bytes, i32 0, i32 1
         store volatile i8 %a, i8* %0
@@ -117,9 +120,9 @@ entry:
 ; We won't test the way the global address is calculated in this test. This is
 ; just to get the register number for the other checks.
 ; SYM32-DAG:           addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes)
-; SYM64-DAG:           ld [[R1:\$[0-9]]], %got_disp(bytes)(
+; SYM64-DAG:           ld [[R1:\$[0-9]+]], %got_disp(bytes)(
 ; SYM32-DAG:           addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
-; SYM64-DAG:           ld [[R2:\$[0-9]]], %got_disp(dwords)(
+; SYM64-DAG:           ld [[R2:\$[0-9]+]], %got_disp(dwords)(
 
 ; The first argument is the same in O32/N32/N64.
 ; ALL-DAG:           sb $4, 1([[R1]])
@@ -137,8 +140,7 @@ entry:
 ; It's not clear why O32 uses lbu for this argument, but it's not wrong so we'll
 ; accept it for now. The only IR difference is that this argument has
 ; anyext from i8 and align 8 on it.
-; O32LE-DAG:           lbu [[R3:\$[0-9]+]], 16($sp)
-; O32BE-DAG:           lbu [[R3:\$[0-9]+]], 19($sp)
+; O32-DAG:           lw [[R3:\$[0-9]+]], 16($sp)
 ; O32-DAG:           sb [[R3]], 2([[R1]])
 ; NEW-DAG:           sb $6, 2([[R1]])
 ; O32-DAG:           lw [[R3:\$[0-9]+]], 20($sp)
@@ -166,5 +168,5 @@ entry:
 ; increase by 4 for O32 and 8 for N32/N64.
 ; O32-DAG:           lw [[R3:\$[0-9]+]], 48($sp)
 ; O32-DAG:           sb [[R3]], 7([[R1]])
-; NEW-DAG:           lw [[R3:\$[0-9]+]], 0($sp)
+; NEW-DAG:           ld [[R3:\$[0-9]+]], 0($sp)
 ; NEW-DAG:           sb [[R3]], 7([[R1]])
diff --git a/test/CodeGen/Mips/cconv/return-float.ll b/test/CodeGen/Mips/cconv/return-float.ll
index 28cf83d..d1a5e4f 100644
--- a/test/CodeGen/Mips/cconv/return-float.ll
+++ b/test/CodeGen/Mips/cconv/return-float.ll
@@ -30,7 +30,7 @@ entry:
 ; O32-DAG:           lw $2, %lo(float)([[R1]])
 ; N32-DAG:           lui [[R1:\$[0-9]+]], %hi(float)
 ; N32-DAG:           lw $2, %lo(float)([[R1]])
-; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(float)($1)
+; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(float)(
 ; N64-DAG:           lw $2, 0([[R1]])
 
 define double @retdouble() nounwind {
@@ -44,5 +44,5 @@ entry:
 ; O32-DAG:           addiu [[R2:\$[0-9]+]], [[R1]], %lo(double)
 ; O32-DAG:           lw $3, 4([[R2]])
 ; N32-DAG:           ld $2, %lo(double)([[R1:\$[0-9]+]])
-; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(double)($1)
+; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(double)(
 ; N64-DAG:           ld $2, 0([[R1]])
diff --git a/test/CodeGen/Mips/cconv/return-hard-float.ll b/test/CodeGen/Mips/cconv/return-hard-float.ll
index 371b3a5..123b499 100644
--- a/test/CodeGen/Mips/cconv/return-hard-float.ll
+++ b/test/CodeGen/Mips/cconv/return-hard-float.ll
@@ -10,6 +10,9 @@
 ; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
 ; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
 
+; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static -mattr=+o32,+fp64 < %s | FileCheck --check-prefix=ALL --check-prefix=032FP64 %s
+; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static -mattr=+o32,+fp64 < %s | FileCheck --check-prefix=ALL --check-prefix=032FP64 %s
+
 ; Test the float returns for all ABI's and byte orders as specified by
 ; section 5 of MD00305 (MIPS ABIs Described).
 
@@ -30,7 +33,7 @@ entry:
 ; O32-DAG:           lwc1 $f0, %lo(float)([[R1]])
 ; N32-DAG:           lui [[R1:\$[0-9]+]], %hi(float)
 ; N32-DAG:           lwc1 $f0, %lo(float)([[R1]])
-; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(float)($1)
+; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(float)(
 ; N64-DAG:           lwc1 $f0, 0([[R1]])
 
 define double @retdouble() nounwind {
@@ -42,5 +45,15 @@ entry:
 ; ALL-LABEL: retdouble:
 ; O32-DAG:           ldc1 $f0, %lo(double)([[R1:\$[0-9]+]])
 ; N32-DAG:           ldc1 $f0, %lo(double)([[R1:\$[0-9]+]])
-; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(double)($1)
+; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(double)(
 ; N64-DAG:           ldc1 $f0, 0([[R1]])
+
+define { double, double } @retComplexDouble() #0 {
+  %retval = alloca { double, double }, align 8
+  %1 = load { double, double }* %retval
+  ret { double, double } %1
+}
+
+; ALL-LABEL: retComplexDouble:
+; 032FP64-DAG:      ldc1     $f0, 0($sp)
+; 032FP64-DAG:      ldc1     $f2, 8($sp)
diff --git a/test/CodeGen/Mips/cconv/return-hard-struct-f128.ll b/test/CodeGen/Mips/cconv/return-hard-struct-f128.ll
new file mode 100644
index 0000000..2e84477
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/return-hard-struct-f128.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+
+; Test return of {fp128} agrees with de-facto N32/N64 ABI.
+
+@struct_fp128 = global {fp128} zeroinitializer
+
+define inreg {fp128} @ret_struct_fp128() nounwind {
+entry:
+        %0 = load volatile {fp128}* @struct_fp128
+        ret {fp128} %0
+}
+
+; ALL-LABEL: ret_struct_fp128:
+
+; O32 generates different IR so we don't test it here. It returns the struct
+; indirectly.
+
+; Contrary to the N32/N64 ABI documentation, a struct containing a long double
+; is returned in $f0, and $f1 instead of the usual $f0, and $f2. This is to
+; match the de facto ABI as implemented by GCC.
+; N32-DAG:        lui [[R1:\$[0-9]+]], %hi(struct_fp128)
+; N32-DAG:        ld  [[R2:\$[0-9]+]], %lo(struct_fp128)([[R1]])
+; N32-DAG:        dmtc1 [[R2]], $f0
+; N32-DAG:        addiu [[R3:\$[0-9]+]], [[R1]], %lo(struct_fp128)
+; N32-DAG:        ld  [[R4:\$[0-9]+]], 8([[R3]])
+; N32-DAG:        dmtc1 [[R4]], $f1
+
+; N64-DAG:        ld  [[R1:\$[0-9]+]], %got_disp(struct_fp128)($1)
+; N64-DAG:        ld  [[R2:\$[0-9]+]], 0([[R1]])
+; N64-DAG:        dmtc1 [[R2]], $f0
+; N64-DAG:        ld  [[R4:\$[0-9]+]], 8([[R1]])
+; N64-DAG:        dmtc1 [[R4]], $f1
diff --git a/test/CodeGen/Mips/cconv/return-struct.ll b/test/CodeGen/Mips/cconv/return-struct.ll
new file mode 100644
index 0000000..11a8cf0
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/return-struct.ll
@@ -0,0 +1,232 @@
+; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 --check-prefix=O32-BE %s
+; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 --check-prefix=O32-LE %s
+
+; RUN-TODO: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 --check-prefix=N32-BE %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 --check-prefix=N32-LE %s
+
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 --check-prefix=N64-BE %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 --check-prefix=N64-LE %s
+
+; Test struct returns for all ABI's and byte orders.
+
+@struct_byte = global {i8} zeroinitializer
+@struct_2byte = global {i8,i8} zeroinitializer
+@struct_3xi16 = global {[3 x i16]} zeroinitializer
+@struct_6xi32 = global {[6 x i32]} zeroinitializer
+@struct_128xi16 = global {[128 x i16]} zeroinitializer
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
+
+define inreg {i8} @ret_struct_i8() nounwind {
+entry:
+        %0 = load volatile {i8}* @struct_byte
+        ret {i8} %0
+}
+
+; ALL-LABEL: ret_struct_i8:
+; O32-DAG:           lui [[R1:\$[0-9]+]], %hi(struct_byte)
+; O32-DAG:           lbu $2, %lo(struct_byte)([[R1]])
+
+; N32-LE-DAG:        lui [[R1:\$[0-9]+]], %hi(struct_byte)
+; N32-LE-DAG:        lb $2, %lo(struct_byte)([[R1]])
+
+; N32-BE-DAG:        lui [[R1:\$[0-9]+]], %hi(struct_byte)
+; N32-BE-DAG:        lb [[R2:\$[0-9]+]], %lo(struct_byte)([[R1]])
+; N32-BE-DAG:        dsll $2, [[R2]], 56
+
+; N64-LE-DAG:        ld  [[R1:\$[0-9]+]], %got_disp(struct_byte)($1)
+; N64-LE-DAG:        lb $2, 0([[R1]])
+
+; N64-BE-DAG:        ld  [[R1:\$[0-9]+]], %got_disp(struct_byte)($1)
+; N64-BE-DAG:        lb [[R2:\$[0-9]+]], 0([[R1]])
+; N64-BE-DAG:        dsll $2, [[R2]], 56
+
+; This test is based on the way clang currently lowers {i8,i8} to {i16}.
+; FIXME: It should probably work for without any lowering too but this doesn't
+;        work as expected. Each member gets mapped to a register rather than
+;        packed into a single register.
+define inreg {i16} @ret_struct_i16() nounwind {
+entry:
+        %retval = alloca {i8,i8}, align 1
+        %0 = bitcast {i8,i8}* %retval to i8*
+        call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds ({i8,i8}* @struct_2byte, i32 0, i32 0), i64 2, i32 1, i1 false)
+        %1 = bitcast {i8,i8}* %retval to {i16}*
+        %2 = load volatile {i16}* %1
+        ret {i16} %2
+}
+
+; ALL-LABEL: ret_struct_i16:
+; O32-DAG:           lui [[R1:\$[0-9]+]], %hi(struct_2byte)
+; O32-DAG:           lhu [[R2:\$[0-9]+]], %lo(struct_2byte)([[R1]])
+; O32-DAG:           sh  [[R2]], 0([[SP:\$sp]])
+; O32-DAG:           lhu $2, 0([[SP:\$sp]])
+
+; N32-LE-DAG:        lui [[R1:\$[0-9]+]], %hi(struct_2byte)
+; N32-LE-DAG:        lhu [[R2:\$[0-9]+]], %lo(struct_2byte)([[R1]])
+; N32-LE-DAG:        sh  [[R2]], 8([[SP:\$sp]])
+; N32-LE-DAG:        lh  $2, 8([[SP:\$sp]])
+
+; N32-BE-DAG:        lui [[R1:\$[0-9]+]], %hi(struct_2byte)
+; N32-BE-DAG:        lhu [[R2:\$[0-9]+]], %lo(struct_2byte)([[R1]])
+; N32-BE-DAG:        sh  [[R2]], 8([[SP:\$sp]])
+; N32-BE-DAG:        lh  [[R3:\$[0-9]+]], 8([[SP:\$sp]])
+; N32-BE-DAG:        dsll $2, [[R3]], 48
+
+; N64-LE-DAG:        ld  [[R1:\$[0-9]+]], %got_disp(struct_2byte)($1)
+; N64-LE-DAG:        lhu [[R2:\$[0-9]+]], 0([[R1]])
+; N64-LE-DAG:        sh  [[R2]], 8([[SP:\$sp]])
+; N64-LE-DAG:        lh  $2, 8([[SP:\$sp]])
+
+; N64-BE-DAG:        ld  [[R1:\$[0-9]+]], %got_disp(struct_2byte)($1)
+; N64-BE-DAG:        lhu [[R2:\$[0-9]+]], 0([[R1]])
+; N64-BE-DAG:        sh  [[R2]], 8([[SP:\$sp]])
+; N64-BE-DAG:        lh  [[R3:\$[0-9]+]], 8([[SP:\$sp]])
+; N64-BE-DAG:        dsll $2, [[R3]], 48
+
+; Ensure that structures bigger than 32-bits but smaller than 64-bits are
+; also returned in the upper bits on big endian targets. Previously, these were
+; missed by the CCPromoteToType and the shift didn't happen.
+define inreg {i48} @ret_struct_3xi16() nounwind {
+entry:
+        %0 = load volatile i48* bitcast ({[3 x i16]}* @struct_3xi16 to i48*), align 2
+        %1 = insertvalue {i48} undef, i48 %0, 0
+        ret {i48} %1
+}
+
+; ALL-LABEL: ret_struct_3xi16:
+
+; O32-BE-DAG:        lui [[PTR_HI:\$[0-9]+]], %hi(struct_3xi16)
+; O32-BE-DAG:        addiu [[PTR_LO:\$[0-9]+]], [[PTR_HI]], %lo(struct_3xi16)
+; O32-BE-DAG:        lhu [[R1:\$[0-9]+]], 4([[PTR_LO]])
+; O32-BE-DAG:        lw [[R2:\$[0-9]+]], %lo(struct_3xi16)([[PTR_HI]])
+; O32-BE-DAG:        sll [[R3:\$[0-9]+]], [[R2]], 16
+; O32-BE-DAG:        or  $3, [[R1]], [[R3]]
+; O32-BE-DAG:        srl $2, [[R2]], 16
+
+; O32-LE-DAG:        lui [[PTR_HI:\$[0-9]+]], %hi(struct_3xi16)
+; O32-LE-DAG:        addiu [[PTR_LO:\$[0-9]+]], [[PTR_HI]], %lo(struct_3xi16)
+; O32-LE-DAG:        lhu $3, 4([[PTR_LO]])
+; O32-LE-DAG:        lw $2, %lo(struct_3xi16)([[PTR_HI]])
+
+; N32-LE-DAG:        lui [[PTR_HI:\$[0-9]+]], %hi(struct_3xi16)
+; N32-LE-DAG:        addiu [[PTR_LO:\$[0-9]+]], [[PTR_HI]], %lo(struct_3xi16)
+; N32-LE-DAG:        lh [[R1:\$[0-9]+]], 4([[PTR_LO]])
+; N32-LE-DAG:        lwu [[R2:\$[0-9]+]], %lo(struct_3xi16)([[PTR_HI]])
+; N32-LE-DAG:        dsll [[R3:\$[0-9]+]], [[R1]], 32
+; N32-LE-DAG:        or $2, [[R2]], [[R3]]
+
+; N32-BE-DAG:        lui [[PTR_HI:\$[0-9]+]], %hi(struct_3xi16)
+; N32-BE-DAG:        addiu [[PTR_LO:\$[0-9]+]], [[PTR_HI]], %lo(struct_3xi16)
+; N32-BE-DAG:        lw [[R1:\$[0-9]+]], %lo(struct_3xi16)([[PTR_HI]])
+; N32-BE-DAG:        dsll [[R2:\$[0-9]+]], [[R1]], 16
+; N32-BE-DAG:        lhu [[R3:\$[0-9]+]], 4([[PTR_LO]])
+; N32-BE-DAG:        or [[R4:\$[0-9]+]], [[R3]], [[R2]]
+; N32-BE-DAG:        dsll $2, [[R4]], 16
+
+; N64-LE-DAG:        ld  [[PTR:\$[0-9]+]], %got_disp(struct_3xi16)($1)
+; N64-LE-DAG:        lh [[R1:\$[0-9]+]], 4([[PTR]])
+; N64-LE-DAG:        lwu [[R2:\$[0-9]+]], 0([[PTR]])
+; N64-LE-DAG:        dsll [[R3:\$[0-9]+]], [[R1]], 32
+; N64-LE-DAG:        or $2, [[R2]], [[R3]]
+
+; N64-BE-DAG:        ld  [[PTR:\$[0-9]+]], %got_disp(struct_3xi16)($1)
+; N64-BE-DAG:        lw [[R1:\$[0-9]+]], 0([[PTR]])
+; N64-BE-DAG:        dsll [[R2:\$[0-9]+]], [[R1]], 16
+; N64-BE-DAG:        lhu [[R3:\$[0-9]+]], 4([[PTR]])
+; N64-BE-DAG:        or [[R4:\$[0-9]+]], [[R3]], [[R2]]
+; N32-BE-DAG:        dsll $2, [[R4]], 16
+
+; Ensure that large structures (>128-bit) are returned indirectly.
+; We pick an extremely large structure so we don't have to match inlined memcpy's.
+define void @ret_struct_128xi16({[128 x i16]}* sret %returnval) {
+entry:
+        %0 = bitcast {[128 x i16]}* %returnval to i8*
+        call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ({[128 x i16]}* @struct_128xi16 to i8*), i64 256, i32 2, i1 false)
+        ret void
+}
+
+; ALL-LABEL: ret_struct_128xi16:
+
+; sret pointer is already in $4
+; O32-DAG:        lui [[PTR:\$[0-9]+]], %hi(struct_128xi16)
+; O32-DAG:        addiu $5, [[PTR]], %lo(struct_128xi16)
+; O32:            jal memcpy
+
+; sret pointer is already in $4
+; N32-DAG:        lui [[PTR_HI:\$[0-9]+]], %hi(struct_128xi16)
+; N32-DAG:        addiu [[PTR:\$[0-9]+]], [[PTR_HI]], %lo(struct_128xi16)
+; FIXME: This signext isn't necessary. Like integers, pointers are
+;        but unlike integers, pointers cannot have the signext attribute.
+; N32-DAG:        sll $5, [[PTR]], 0
+; N32:            jal memcpy
+
+; sret pointer is already in $4
+; N64-DAG:        ld $5, %got_disp(struct_128xi16)(
+; N64-DAG:        ld $25, %call16(memcpy)(
+; N64:            jalr $25
+
+; Ensure that large structures (>128-bit) are returned indirectly.
+; This will generate inlined memcpy's anyway so pick the smallest large
+; structure
+; This time we let the backend lower the sret argument.
+define {[6 x i32]} @ret_struct_6xi32() {
+entry:
+        %0 = load volatile {[6 x i32]}* @struct_6xi32, align 2
+        ret {[6 x i32]} %0
+}
+
+; ALL-LABEL: ret_struct_6xi32:
+
+; sret pointer is already in $4
+; O32-DAG:        lui [[PTR_HI:\$[0-9]+]], %hi(struct_6xi32)
+; O32-DAG:        addiu [[PTR:\$[0-9]+]], [[PTR_HI]], %lo(struct_6xi32)
+; O32-DAG:        lw [[T0:\$[0-9]+]], %lo(struct_6xi32)([[PTR]])
+; O32-DAG:        lw [[T1:\$[0-9]+]], 4([[PTR]])
+; O32-DAG:        lw [[T2:\$[0-9]+]], 8([[PTR]])
+; O32-DAG:        lw [[T3:\$[0-9]+]], 12([[PTR]])
+; O32-DAG:        lw [[T4:\$[0-9]+]], 16([[PTR]])
+; O32-DAG:        lw [[T5:\$[0-9]+]], 20([[PTR]])
+; O32-DAG:        sw [[T0]], 0($4)
+; O32-DAG:        sw [[T1]], 4($4)
+; O32-DAG:        sw [[T2]], 8($4)
+; O32-DAG:        sw [[T3]], 12($4)
+; O32-DAG:        sw [[T4]], 16($4)
+; O32-DAG:        sw [[T5]], 20($4)
+
+; FIXME: This signext isn't necessary. Like integers, pointers are
+;        but unlike integers, pointers cannot have the signext attribute.
+;        In this case we don't have anywhere to put the signext either since
+;        the sret argument is invented by the backend.
+; N32-DAG:        sll [[RET_PTR:\$[0-9]+]], $4, 0
+; N32-DAG:        lui [[PTR_HI:\$[0-9]+]], %hi(struct_6xi32)
+; N32-DAG:        addiu [[PTR:\$[0-9]+]], [[PTR_HI]], %lo(struct_6xi32)
+; N32-DAG:        lw [[T0:\$[0-9]+]], %lo(struct_6xi32)([[PTR]])
+; N32-DAG:        lw [[T1:\$[0-9]+]], 4([[PTR]])
+; N32-DAG:        lw [[T2:\$[0-9]+]], 8([[PTR]])
+; N32-DAG:        lw [[T3:\$[0-9]+]], 12([[PTR]])
+; N32-DAG:        lw [[T4:\$[0-9]+]], 16([[PTR]])
+; N32-DAG:        lw [[T5:\$[0-9]+]], 20([[PTR]])
+; N32-DAG:        sw [[T0]], 0([[RET_PTR]])
+; N32-DAG:        sw [[T1]], 4([[RET_PTR]])
+; N32-DAG:        sw [[T2]], 8([[RET_PTR]])
+; N32-DAG:        sw [[T3]], 12([[RET_PTR]])
+; N32-DAG:        sw [[T4]], 16([[RET_PTR]])
+; N32-DAG:        sw [[T5]], 20([[RET_PTR]])
+
+; sret pointer is already in $4
+; N64-DAG:        ld [[PTR:\$[0-9]+]], %got_disp(struct_6xi32)(
+; N64-DAG:        lw [[T0:\$[0-9]+]], 0([[PTR]])
+; N64-DAG:        lw [[T1:\$[0-9]+]], 4([[PTR]])
+; N64-DAG:        lw [[T2:\$[0-9]+]], 8([[PTR]])
+; N64-DAG:        lw [[T3:\$[0-9]+]], 12([[PTR]])
+; N64-DAG:        lw [[T4:\$[0-9]+]], 16([[PTR]])
+; N64-DAG:        lw [[T5:\$[0-9]+]], 20([[PTR]])
+; N64-DAG:        sw [[T0]], 0($4)
+; N64-DAG:        sw [[T1]], 4($4)
+; N64-DAG:        sw [[T2]], 8($4)
+; N64-DAG:        sw [[T3]], 12($4)
+; N64-DAG:        sw [[T4]], 16($4)
+; N64-DAG:        sw [[T5]], 20($4)
diff --git a/test/CodeGen/Mips/cconv/return.ll b/test/CodeGen/Mips/cconv/return.ll
index 76ce5e4..63f9b5f 100644
--- a/test/CodeGen/Mips/cconv/return.ll
+++ b/test/CodeGen/Mips/cconv/return.ll
@@ -33,7 +33,7 @@ entry:
 ; O32-DAG:           lbu $2, %lo(byte)([[R1]])
 ; N32-DAG:           lui [[R1:\$[0-9]+]], %hi(byte)
 ; N32-DAG:           lbu $2, %lo(byte)([[R1]])
-; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(byte)($1)
+; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(byte)(
 ; N64-DAG:           lbu $2, 0([[R1]])
 
 define i32 @reti32() nounwind {
@@ -47,7 +47,7 @@ entry:
 ; O32-DAG:           lw $2, %lo(word)([[R1]])
 ; N32-DAG:           lui [[R1:\$[0-9]+]], %hi(word)
 ; N32-DAG:           lw $2, %lo(word)([[R1]])
-; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(word)($1)
+; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(word)(
 ; N64-DAG:           lw $2, 0([[R1]])
 
 define i64 @reti64() nounwind {
diff --git a/test/CodeGen/Mips/cfi_offset.ll b/test/CodeGen/Mips/cfi_offset.ll
new file mode 100644
index 0000000..e23855b
--- /dev/null
+++ b/test/CodeGen/Mips/cfi_offset.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=mips -mattr=+o32 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EB
+; RUN: llc -march=mipsel -mattr=+o32 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EL
+; RUN: llc -march=mips -mattr=+o32,+fpxx < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EB
+; RUN: llc -march=mipsel -mattr=+o32,+fpxx < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EL
+; RUN: llc -march=mips -mattr=+o32,+fp64 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EB
+; RUN: llc -march=mipsel -mattr=+o32,+fp64 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EL
+
+@var = global double 0.0
+
+declare void @foo(...)
+
+define void @bar() {
+
+; CHECK-LABEL:  bar:
+
+; CHECK:  .cfi_def_cfa_offset 40
+; CHECK:  sdc1  $f22, 32($sp)
+; CHECK:  sdc1  $f20, 24($sp)
+; CHECK:  sw  $ra, 20($sp)
+; CHECK:  sw  $16, 16($sp)
+
+; CHECK-EB:  .cfi_offset 55, -8
+; CHECK-EB:  .cfi_offset 54, -4
+; CHECK-EB:  .cfi_offset 53, -16
+; CHECK-EB:  .cfi_offset 52, -12
+
+; CHECK-EL:  .cfi_offset 54, -8
+; CHECK-EL:  .cfi_offset 55, -4
+; CHECK-EL:  .cfi_offset 52, -16
+; CHECK-EL:  .cfi_offset 53, -12
+
+; CHECK:  .cfi_offset 31, -20
+; CHECK:  .cfi_offset 16, -24
+
+    %val1 = load volatile double* @var
+    %val2 = load volatile double* @var
+    call void (...)* @foo() nounwind
+    store volatile double %val1, double* @var
+    store volatile double %val2, double* @var
+    ret void
+}
diff --git a/test/CodeGen/Mips/cmov.ll b/test/CodeGen/Mips/cmov.ll
index 999bdb4..b12c2df 100644
--- a/test/CodeGen/Mips/cmov.ll
+++ b/test/CodeGen/Mips/cmov.ll
@@ -38,7 +38,7 @@
 ; 64-CMP-DAG:   or $[[T2:[0-9]+]], $[[T0]], $[[T1]]
 ; 64-CMP-DAG:   ld $2, 0($[[T2]])
 
-define i32* @cmov1(i32 %s) nounwind readonly {
+define i32* @cmov1(i32 signext %s) nounwind readonly {
 entry:
   %tobool = icmp ne i32 %s, 0
   %tmp1 = load i32** @i3, align 4
@@ -78,7 +78,7 @@ entry:
 ; 64-CMP-DAG:   or $[[T2:[0-9]+]], $[[T0]], $[[T1]]
 ; 64-CMP-DAG:   lw $2, 0($[[T2]])
 
-define i32 @cmov2(i32 %s) nounwind readonly {
+define i32 @cmov2(i32 signext %s) nounwind readonly {
 entry:
   %tobool = icmp ne i32 %s, 0
   %tmp1 = load i32* @c, align 4
@@ -109,13 +109,46 @@ entry:
 ; 64-CMP-DAG:   selnez $[[T1:[0-9]+]], $6, $[[CC]]
 ; 64-CMP-DAG:   or $2, $[[T0]], $[[T1]]
 
-define i32 @cmov3(i32 %a, i32 %b, i32 %c) nounwind readnone {
+define i32 @cmov3(i32 signext %a, i32 signext %b, i32 signext %c) nounwind readnone {
 entry:
   %cmp = icmp eq i32 %a, 234
   %cond = select i1 %cmp, i32 %b, i32 %c
   ret i32 %cond
 }
 
+; ALL-LABEL: cmov3_ne:
+
+; We won't check the result register since we can't know if the move is first
+; or last. We do know it will be either one of two registers so we can at least
+; check that.
+
+; FIXME: Use xori instead of addiu+xor.
+; 32-CMOV:      addiu $[[R0:[0-9]+]], $zero, 234
+; 32-CMOV:      xor $[[R1:[0-9]+]], $4, $[[R0]]
+; 32-CMOV:      movn ${{[26]}}, $5, $[[R1]]
+
+; 32-CMP-DAG:   xori $[[CC:[0-9]+]], $4, 234
+; 32-CMP-DAG:   selnez $[[T0:[0-9]+]], $5, $[[CC]]
+; 32-CMP-DAG:   seleqz $[[T1:[0-9]+]], $6, $[[CC]]
+; 32-CMP-DAG:   or $2, $[[T0]], $[[T1]]
+
+; FIXME: Use xori instead of addiu+xor.
+; 64-CMOV:      addiu $[[R0:[0-9]+]], $zero, 234
+; 64-CMOV:      xor $[[R1:[0-9]+]], $4, $[[R0]]
+; 64-CMOV:      movn ${{[26]}}, $5, $[[R1]]
+
+; 64-CMP-DAG:   xori $[[CC:[0-9]+]], $4, 234
+; 64-CMP-DAG:   selnez $[[T0:[0-9]+]], $5, $[[CC]]
+; 64-CMP-DAG:   seleqz $[[T1:[0-9]+]], $6, $[[CC]]
+; 64-CMP-DAG:   or $2, $[[T0]], $[[T1]]
+
+define i32 @cmov3_ne(i32 signext %a, i32 signext %b, i32 signext %c) nounwind readnone {
+entry:
+  %cmp = icmp ne i32 %a, 234
+  %cond = select i1 %cmp, i32 %b, i32 %c
+  ret i32 %cond
+}
+
 ; ALL-LABEL: cmov4:
 
 ; We won't check the result register since we can't know if the move is first
@@ -146,13 +179,54 @@ entry:
 ; 64-CMP-DAG:  selnez $[[T1:[0-9]+]], $6, $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i64 @cmov4(i32 %a, i64 %b, i64 %c) nounwind readnone {
+define i64 @cmov4(i32 signext %a, i64 %b, i64 %c) nounwind readnone {
 entry:
   %cmp = icmp eq i32 %a, 234
   %cond = select i1 %cmp, i64 %b, i64 %c
   ret i64 %cond
 }
 
+; ALL-LABEL: cmov4_ne:
+
+; We won't check the result register since we can't know if the move is first
+; or last. We do know it will be one of two registers so we can at least check
+; that.
+
+; FIXME: Use xori instead of addiu+xor.
+; 32-CMOV-DAG: addiu $[[R0:[0-9]+]], $zero, 234
+; 32-CMOV-DAG: xor $[[R1:[0-9]+]], $4, $[[R0]]
+; 32-CMOV-DAG: lw $[[R2:2]], 16($sp)
+; 32-CMOV-DAG: lw $[[R3:3]], 20($sp)
+; 32-CMOV-DAG: movn $[[R2]], $6, $[[R1]]
+; 32-CMOV-DAG: movn $[[R3]], $7, $[[R1]]
+
+; 32-CMP-DAG:  xori $[[R0:[0-9]+]], $4, 234
+; 32-CMP-DAG:  lw $[[R1:[0-9]+]], 16($sp)
+; 32-CMP-DAG:  lw $[[R2:[0-9]+]], 20($sp)
+; 32-CMP-DAG:  selnez $[[T0:[0-9]+]], $6, $[[R0]]
+; 32-CMP-DAG:  selnez $[[T1:[0-9]+]], $7, $[[R0]]
+; 32-CMP-DAG:  seleqz $[[T2:[0-9]+]], $[[R1]], $[[R0]]
+; 32-CMP-DAG:  seleqz $[[T3:[0-9]+]], $[[R2]], $[[R0]]
+; 32-CMP-DAG:  or $2, $[[T0]], $[[T2]]
+; 32-CMP-DAG:  or $3, $[[T1]], $[[T3]]
+
+; FIXME: Use xori instead of addiu+xor.
+; 64-CMOV: addiu $[[R0:[0-9]+]], $zero, 234
+; 64-CMOV: xor $[[R1:[0-9]+]], $4, $[[R0]]
+; 64-CMOV: movn ${{[26]}}, $5, $[[R1]]
+
+; 64-CMP-DAG:  xori $[[R0:[0-9]+]], $4, 234
+; 64-CMP-DAG:  selnez $[[T0:[0-9]+]], $5, $[[R0]]
+; 64-CMP-DAG:  seleqz $[[T1:[0-9]+]], $6, $[[R0]]
+; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
+
+define i64 @cmov4_ne(i32 signext %a, i64 %b, i64 %c) nounwind readnone {
+entry:
+  %cmp = icmp ne i32 %a, 234
+  %cond = select i1 %cmp, i64 %b, i64 %c
+  ret i64 %cond
+}
+
 ; slti and conditional move.
 ;
 ; Check that, pattern
@@ -189,7 +263,7 @@ entry:
 ; 64-CMP-DAG:  selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i32 @slti0(i32 %a) {
+define i32 @slti0(i32 signext %a) {
 entry:
   %cmp = icmp sgt i32 %a, 32766
   %cond = select i1 %cmp, i32 3, i32 5
@@ -228,7 +302,7 @@ entry:
 ; 64-CMP-DAG:  seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i32 @slti1(i32 %a) {
+define i32 @slti1(i32 signext %a) {
 entry:
   %cmp = icmp sgt i32 %a, 32767
   %cond = select i1 %cmp, i32 7, i32 5
@@ -263,7 +337,7 @@ entry:
 ; 64-CMP-DAG:  selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i32 @slti2(i32 %a) {
+define i32 @slti2(i32 signext %a) {
 entry:
   %cmp = icmp sgt i32 %a, -32769
   %cond = select i1 %cmp, i32 3, i32 5
@@ -306,7 +380,7 @@ entry:
 ; 64-CMP-DAG:  seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i32 @slti3(i32 %a) {
+define i32 @slti3(i32 signext %a) {
 entry:
   %cmp = icmp sgt i32 %a, -32770
   %cond = select i1 %cmp, i32 3, i32 5
@@ -493,7 +567,7 @@ entry:
 ; 64-CMP-DAG:  selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i32 @sltiu0(i32 %a) {
+define i32 @sltiu0(i32 signext %a) {
 entry:
   %cmp = icmp ugt i32 %a, 32766
   %cond = select i1 %cmp, i32 3, i32 5
@@ -532,7 +606,7 @@ entry:
 ; 64-CMP-DAG:  seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i32 @sltiu1(i32 %a) {
+define i32 @sltiu1(i32 signext %a) {
 entry:
   %cmp = icmp ugt i32 %a, 32767
   %cond = select i1 %cmp, i32 7, i32 5
@@ -567,7 +641,7 @@ entry:
 ; 64-CMP-DAG:  selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i32 @sltiu2(i32 %a) {
+define i32 @sltiu2(i32 signext %a) {
 entry:
   %cmp = icmp ugt i32 %a, -32769
   %cond = select i1 %cmp, i32 3, i32 5
@@ -610,7 +684,7 @@ entry:
 ; 64-CMP-DAG:  seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i32 @sltiu3(i32 %a) {
+define i32 @sltiu3(i32 signext %a) {
 entry:
   %cmp = icmp ugt i32 %a, -32770
   %cond = select i1 %cmp, i32 3, i32 5
@@ -623,7 +697,7 @@ entry:
 ; doesn't generate conditional moves
 ; for constant operands whose difference is |1|
 
-define i32 @slti4(i32 %a) nounwind readnone {
+define i32 @slti4(i32 signext %a) nounwind readnone {
   %1 = icmp slt i32 %a, 7
   %2 = select i1 %1, i32 4, i32 3
   ret i32 %2
@@ -649,7 +723,7 @@ define i32 @slti4(i32 %a) nounwind readnone {
 ; 64-CMP-NOT:  seleqz
 ; 64-CMP-NOT:  selnez
 
-define i32 @slti5(i32 %a) nounwind readnone {
+define i32 @slti5(i32 signext %a) nounwind readnone {
   %1 = icmp slt i32 %a, 7
   %2 = select i1 %1, i32 -3, i32 -4
   ret i32 %2
@@ -675,7 +749,7 @@ define i32 @slti5(i32 %a) nounwind readnone {
 ; 64-CMP-NOT:  seleqz
 ; 64-CMP-NOT:  selnez
 
-define i32 @slti6(i32 %a) nounwind readnone {
+define i32 @slti6(i32 signext %a) nounwind readnone {
   %1 = icmp slt i32 %a, 7
   %2 = select i1 %1, i32 3, i32 4
   ret i32 %2
@@ -683,24 +757,9 @@ define i32 @slti6(i32 %a) nounwind readnone {
 
 ; ALL-LABEL: slti6:
 
-; 32-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7
-; 32-CMOV-DAG: xori [[R1]], [[R1]], 1
-; 32-CMOV-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
-; 32-CMOV-NOT: movn
-
-; 32-CMP-DAG:  slti [[R1:\$[0-9]+]], $4, 7
-; 32-CMP-DAG:  xori [[R1]], [[R1]], 1
-; 32-CMP-DAG:  addiu [[R2:\$[0-9]+]], [[R1]], 3
-; 32-CMP-NOT:  seleqz
-; 32-CMP-NOT:  selnez
-
-; 64-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7
-; 64-CMOV-DAG: xori [[R1]], [[R1]], 1
-; 64-CMOV-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
-; 64-CMOV-NOT: movn
-
-; 64-CMP-DAG:  slti [[R1:\$[0-9]+]], $4, 7
-; 64-CMP-DAG:  xori [[R1]], [[R1]], 1
-; 64-CMP-DAG:  addiu [[R2:\$[0-9]+]], [[R1]], 3
-; 64-CMP-NOT:  seleqz
-; 64-CMP-NOT:  selnez
+; ALL-DAG: addiu [[R1:\$[0-9]+]], $zero, 6
+; ALL-DAG: slt [[R1]], [[R1]], $4
+; ALL-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
+; ALL-NOT: movn
+; ALL-NOT:  seleqz
+; ALL-NOT:  selnez
diff --git a/test/CodeGen/Mips/const-mult.ll b/test/CodeGen/Mips/const-mult.ll
index 1862021..60b2a88 100644
--- a/test/CodeGen/Mips/const-mult.ll
+++ b/test/CodeGen/Mips/const-mult.ll
@@ -5,7 +5,7 @@
 ; CHECK: sll $[[R0:[0-9]+]], $4, 2
 ; CHECK: addu ${{[0-9]+}}, $[[R0]], $4
 
-define i32 @mul5_32(i32 %a) {
+define i32 @mul5_32(i32 signext %a) {
 entry:
   %mul = mul nsw i32 %a, 5
   ret i32 %mul
@@ -17,7 +17,7 @@ entry:
 ; CHECK-DAG: sll $[[R2:[0-9]+]], $4, 5
 ; CHECK:     subu ${{[0-9]+}}, $[[R2]], $[[R1]]
 
-define i32 @mul27_32(i32 %a) {
+define i32 @mul27_32(i32 signext %a) {
 entry:
   %mul = mul nsw i32 %a, 27
   ret i32 %mul
@@ -29,7 +29,7 @@ entry:
 ; CHECK-DAG: sll $[[R2:[0-9]+]], $4, 31
 ; CHECK:     addu ${{[0-9]+}}, $[[R2]], $[[R1]]
 
-define i32 @muln2147483643_32(i32 %a) {
+define i32 @muln2147483643_32(i32 signext %a) {
 entry:
   %mul = mul nsw i32 %a, -2147483643
   ret i32 %mul
@@ -41,7 +41,7 @@ entry:
 ; CHECK64-DAG: dsll $[[R2:[0-9]+]], $4, 63
 ; CHECK64:     daddu ${{[0-9]+}}, $[[R2]], $[[R1]]
 
-define i64 @muln9223372036854775805_64(i64 %a) {
+define i64 @muln9223372036854775805_64(i64 signext %a) {
 entry:
   %mul = mul nsw i64 %a, -9223372036854775805
   ret i64 %mul
diff --git a/test/CodeGen/Mips/countleading.ll b/test/CodeGen/Mips/countleading.ll
index 6e63cff..b7aad04 100644
--- a/test/CodeGen/Mips/countleading.ll
+++ b/test/CodeGen/Mips/countleading.ll
@@ -11,7 +11,7 @@
 ;   MIPS32-GT-R1 - MIPS64r1 and above (does not include MIPS64's)
 ;   MIPS64-GT-R1 - MIPS64r1 and above
 
-define i32 @ctlz_i32(i32 %X) nounwind readnone {
+define i32 @ctlz_i32(i32 signext %X) nounwind readnone {
 entry:
 ; ALL-LABEL: ctlz_i32:
 
@@ -27,7 +27,7 @@ entry:
 
 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
 
-define i32 @ctlo_i32(i32 %X) nounwind readnone {
+define i32 @ctlo_i32(i32 signext %X) nounwind readnone {
 entry:
 ; ALL-LABEL: ctlo_i32:
 
diff --git a/test/CodeGen/Mips/ctlz-v.ll b/test/CodeGen/Mips/ctlz-v.ll
index 270f404..3d580e5 100644
--- a/test/CodeGen/Mips/ctlz-v.ll
+++ b/test/CodeGen/Mips/ctlz-v.ll
@@ -6,12 +6,12 @@ declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1)
 define <2 x i32> @ctlzv2i32(<2 x i32> %x) {
 entry:
 ; MIPS32: clz     $2, $4
-; MIPS32: jr      $ra
 ; MIPS32: clz     $3, $5
 
-; MIPS64: clz     $2, $4
-; MIPS64: jr      $ra
-; MIPS64: clz     $3, $5
+; MIPS64-DAG: sll $[[A0:[0-9]+]], $4, 0
+; MIPS64-DAG: clz $2, $[[A0]]
+; MIPS64-DAG: sll $[[A1:[0-9]+]], $5, 0
+; MIPS64-DAG: clz $3, $[[A1]]
 
   %ret = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %x, i1 true)
   ret <2 x i32> %ret
diff --git a/test/CodeGen/Mips/cttz-v.ll b/test/CodeGen/Mips/cttz-v.ll
index 9470441..85f69f9 100644
--- a/test/CodeGen/Mips/cttz-v.ll
+++ b/test/CodeGen/Mips/cttz-v.ll
@@ -18,14 +18,16 @@ entry:
 ; MIPS32-DAG: jr      $ra
 ; MIPS32-DAG: subu    $3, $[[R4]], $[[R8]]
 
-; MIPS64-DAG: addiu   $[[R0:[0-9]+]], $4, -1
-; MIPS64-DAG: not     $[[R1:[0-9]+]], $4
+; MIPS64-DAG: sll     $[[A0:[0-9]+]], $4, 0
+; MIPS64-DAG: addiu   $[[R0:[0-9]+]], $[[A0]], -1
+; MIPS64-DAG: not     $[[R1:[0-9]+]], $[[A0]]
 ; MIPS64-DAG: and     $[[R2:[0-9]+]], $[[R1]], $[[R0]]
 ; MIPS64-DAG: clz     $[[R3:[0-9]+]], $[[R2]]
 ; MIPS64-DAG: addiu   $[[R4:[0-9]+]], $zero, 32
 ; MIPS64-DAG: subu    $2, $[[R4]], $[[R3]]
-; MIPS64-DAG: addiu   $[[R5:[0-9]+]], $5, -1
-; MIPS64-DAG: not     $[[R6:[0-9]+]], $5
+; MIPS64-DAG: sll     $[[A1:[0-9]+]], $5, 0
+; MIPS64-DAG: addiu   $[[R5:[0-9]+]], $[[A1]], -1
+; MIPS64-DAG: not     $[[R6:[0-9]+]], $[[A1]]
 ; MIPS64-DAG: and     $[[R7:[0-9]+]], $[[R6]], $[[R5]]
 ; MIPS64-DAG: clz     $[[R8:[0-9]+]], $[[R7]]
 ; MIPS64-DAG: jr      $ra
diff --git a/test/CodeGen/Mips/divrem.ll b/test/CodeGen/Mips/divrem.ll
index 97f8360..a9cfe0f 100644
--- a/test/CodeGen/Mips/divrem.ll
+++ b/test/CodeGen/Mips/divrem.ll
@@ -27,7 +27,7 @@
 @g0 = common global i32 0, align 4
 @g1 = common global i32 0, align 4
 
-define i32 @sdiv1(i32 %a0, i32 %a1) nounwind readnone {
+define i32 @sdiv1(i32 signext %a0, i32 signext %a1) nounwind readnone {
 entry:
 ; ALL-LABEL: sdiv1:
 
@@ -54,7 +54,7 @@ entry:
   ret i32 %div
 }
 
-define i32 @srem1(i32 %a0, i32 %a1) nounwind readnone {
+define i32 @srem1(i32 signext %a0, i32 signext %a1) nounwind readnone {
 entry:
 ; ALL-LABEL: srem1:
 
@@ -81,7 +81,7 @@ entry:
   ret i32 %rem
 }
 
-define i32 @udiv1(i32 %a0, i32 %a1) nounwind readnone {
+define i32 @udiv1(i32 zeroext %a0, i32 zeroext %a1) nounwind readnone {
 entry:
 ; ALL-LABEL: udiv1:
 
@@ -107,7 +107,7 @@ entry:
   ret i32 %div
 }
 
-define i32 @urem1(i32 %a0, i32 %a1) nounwind readnone {
+define i32 @urem1(i32 zeroext %a0, i32 zeroext %a1) nounwind readnone {
 entry:
 ; ALL-LABEL: urem1:
 
@@ -134,7 +134,7 @@ entry:
   ret i32 %rem
 }
 
-define i32 @sdivrem1(i32 %a0, i32 %a1, i32* nocapture %r) nounwind {
+define i32 @sdivrem1(i32 signext %a0, i32 signext %a1, i32* nocapture %r) nounwind {
 entry:
 ; ALL-LABEL: sdivrem1:
 
@@ -175,7 +175,7 @@ entry:
   ret i32 %div
 }
 
-define i32 @udivrem1(i32 %a0, i32 %a1, i32* nocapture %r) nounwind {
+define i32 @udivrem1(i32 zeroext %a0, i32 zeroext %a1, i32* nocapture %r) nounwind {
 entry:
 ; ALL-LABEL: udivrem1:
 
diff --git a/test/CodeGen/Mips/ehframe-indirect.ll b/test/CodeGen/Mips/ehframe-indirect.ll
index e78497a..b4efb40 100644
--- a/test/CodeGen/Mips/ehframe-indirect.ll
+++ b/test/CodeGen/Mips/ehframe-indirect.ll
@@ -1,5 +1,7 @@
-; RUN: llc -mtriple=mipsel-linux-gnu < %s | FileCheck %s
-; RUN: llc -mtriple=mipsel-linux-android < %s | FileCheck %s
+; RUN: llc -mtriple=mipsel-linux-gnu < %s | FileCheck  -check-prefix=CHECK32 %s
+; RUN: llc -mtriple=mipsel-linux-android < %s | FileCheck -check-prefix=CHECK32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu < %s | FileCheck  -check-prefix=CHECK64 %s
+; RUN: llc -mtriple=mips64el-linux-android < %s | FileCheck -check-prefix=CHECK64 %s
 
 define i32 @main() {
 ; CHECK: .cfi_startproc
@@ -27,8 +29,11 @@ declare void @foo()
 ; CHECK: .hidden DW.ref.__gxx_personality_v0
 ; CHECK: .weak DW.ref.__gxx_personality_v0
 ; CHECK: .section .data.DW.ref.__gxx_personality_v0,"aGw",@progbits,DW.ref.__gxx_personality_v0,comdat
-; CHECK: .align 2
+; CHECK32: .align 2
+; CHECK64: .align 3
 ; CHECK: .type DW.ref.__gxx_personality_v0,@object
-; CHECK: .size DW.ref.__gxx_personality_v0, 4
+; CHECK32: .size DW.ref.__gxx_personality_v0, 4
+; CHECK64: .size DW.ref.__gxx_personality_v0, 8
 ; CHECK: DW.ref.__gxx_personality_v0:
-; CHECK: .4byte __gxx_personality_v0
+; CHECK32: .4byte __gxx_personality_v0
+; CHECK64: .8byte __gxx_personality_v0
diff --git a/test/CodeGen/Mips/fastcc.ll b/test/CodeGen/Mips/fastcc.ll
index 8ee7af8..6b022c5 100644
--- a/test/CodeGen/Mips/fastcc.ll
+++ b/test/CodeGen/Mips/fastcc.ll
@@ -1,6 +1,8 @@
 ; RUN: llc  < %s -march=mipsel | FileCheck %s 
 ; RUN: llc  < %s -mtriple=mipsel-none-nacl-gnu \
 ; RUN:  | FileCheck %s -check-prefix=CHECK-NACL
+; RUN: llc  < %s -march=mipsel -mcpu=mips32 -mattr=+nooddspreg | FileCheck %s -check-prefix=NOODDSPREG
+; RUN: llc  < %s -march=mipsel -mcpu=mips32r2 -mattr=+fp64,+nooddspreg | FileCheck %s -check-prefix=FP64-NOODDSPREG
 
 
 @gi0 = external global i32
@@ -80,6 +82,9 @@
 @g15 = external global i32
 @g16 = external global i32
 
+@fa = common global [11 x float] zeroinitializer, align 4
+@da = common global [11 x double] zeroinitializer, align 8
+
 define void @caller0() nounwind {
 entry:
 ; CHECK: caller0
@@ -264,3 +269,164 @@ entry:
   ret void
 }
 
+define void @caller2() {
+entry:
+
+; NOODDSPREG-LABEL:  caller2:
+
+; Check that first 10 arguments are passed in even float registers
+; f0, f2, ... , f18. Check that 11th argument is passed on stack.
+
+; NOODDSPREG-DAG:    lw      $[[R0:[0-9]+]], %got(fa)(${{[0-9]+|gp}})
+; NOODDSPREG-DAG:    lwc1    $f0, 0($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f2, 4($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f4, 8($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f6, 12($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f8, 16($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f10, 20($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f12, 24($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f14, 28($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f16, 32($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f18, 36($[[R0]])
+
+; NOODDSPREG-DAG:    lwc1    $[[F0:f[0-9]*[02468]]], 40($[[R0]])
+; NOODDSPREG-DAG:    swc1    $[[F0]], 0($sp)
+
+  %0 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 0), align 4
+  %1 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 1), align 4
+  %2 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 2), align 4
+  %3 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 3), align 4
+  %4 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 4), align 4
+  %5 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 5), align 4
+  %6 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 6), align 4
+  %7 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 7), align 4
+  %8 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 8), align 4
+  %9 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 9), align 4
+  %10 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 10), align 4
+  tail call fastcc void @callee2(float %0, float %1, float %2, float %3,
+                                 float %4, float %5, float %6, float %7,
+                                 float %8, float %9, float %10)
+  ret void
+}
+
+define fastcc void @callee2(float %a0, float %a1, float %a2, float %a3,
+                            float %a4, float %a5, float %a6, float %a7,
+                            float %a8, float %a9, float %a10) {
+entry:
+
+; NOODDSPREG-LABEL:  callee2:
+
+; NOODDSPREG:        addiu   $sp, $sp, -[[OFFSET:[0-9]+]]
+
+; Check that first 10 arguments are received in even float registers
+; f0, f2, ... , f18. Check that 11th argument is received on stack.
+
+; NOODDSPREG-DAG:    lw      $[[R0:[0-9]+]], %got(fa)(${{[0-9]+|gp}})
+; NOODDSPREG-DAG:    swc1    $f0, 0($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f2, 4($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f4, 8($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f6, 12($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f8, 16($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f10, 20($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f12, 24($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f14, 28($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f16, 32($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f18, 36($[[R0]])
+
+; NOODDSPREG-DAG:    lwc1    $[[F0:f[0-9]*[02468]]], [[OFFSET]]($sp)
+; NOODDSPREG-DAG:    swc1    $[[F0]], 40($[[R0]])
+
+  store float %a0, float* getelementptr ([11 x float]* @fa, i32 0, i32 0), align 4
+  store float %a1, float* getelementptr ([11 x float]* @fa, i32 0, i32 1), align 4
+  store float %a2, float* getelementptr ([11 x float]* @fa, i32 0, i32 2), align 4
+  store float %a3, float* getelementptr ([11 x float]* @fa, i32 0, i32 3), align 4
+  store float %a4, float* getelementptr ([11 x float]* @fa, i32 0, i32 4), align 4
+  store float %a5, float* getelementptr ([11 x float]* @fa, i32 0, i32 5), align 4
+  store float %a6, float* getelementptr ([11 x float]* @fa, i32 0, i32 6), align 4
+  store float %a7, float* getelementptr ([11 x float]* @fa, i32 0, i32 7), align 4
+  store float %a8, float* getelementptr ([11 x float]* @fa, i32 0, i32 8), align 4
+  store float %a9, float* getelementptr ([11 x float]* @fa, i32 0, i32 9), align 4
+  store float %a10, float* getelementptr ([11 x float]* @fa, i32 0, i32 10), align 4
+  ret void
+}
+
+define void @caller3() {
+entry:
+
+; FP64-NOODDSPREG-LABEL:  caller3:
+
+; Check that first 10 arguments are passed in even float registers
+; f0, f2, ... , f18. Check that 11th argument is passed on stack.
+
+; FP64-NOODDSPREG-DAG:    lw      $[[R0:[0-9]+]], %got(da)(${{[0-9]+|gp}})
+; FP64-NOODDSPREG-DAG:    ldc1    $f0, 0($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f2, 8($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f4, 16($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f6, 24($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f8, 32($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f10, 40($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f12, 48($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f14, 56($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f16, 64($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f18, 72($[[R0]])
+
+; FP64-NOODDSPREG-DAG:    ldc1    $[[F0:f[0-9]*[02468]]], 80($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $[[F0]], 0($sp)
+
+  %0 = load double* getelementptr ([11 x double]* @da, i32 0, i32 0), align 8
+  %1 = load double* getelementptr ([11 x double]* @da, i32 0, i32 1), align 8
+  %2 = load double* getelementptr ([11 x double]* @da, i32 0, i32 2), align 8
+  %3 = load double* getelementptr ([11 x double]* @da, i32 0, i32 3), align 8
+  %4 = load double* getelementptr ([11 x double]* @da, i32 0, i32 4), align 8
+  %5 = load double* getelementptr ([11 x double]* @da, i32 0, i32 5), align 8
+  %6 = load double* getelementptr ([11 x double]* @da, i32 0, i32 6), align 8
+  %7 = load double* getelementptr ([11 x double]* @da, i32 0, i32 7), align 8
+  %8 = load double* getelementptr ([11 x double]* @da, i32 0, i32 8), align 8
+  %9 = load double* getelementptr ([11 x double]* @da, i32 0, i32 9), align 8
+  %10 = load double* getelementptr ([11 x double]* @da, i32 0, i32 10), align 8
+  tail call fastcc void @callee3(double %0, double %1, double %2, double %3,
+                                 double %4, double %5, double %6, double %7,
+                                 double %8, double %9, double %10)
+  ret void
+}
+
+define fastcc void @callee3(double %a0, double %a1, double %a2, double %a3,
+                            double %a4, double %a5, double %a6, double %a7,
+                            double %a8, double %a9, double %a10) {
+entry:
+
+; FP64-NOODDSPREG-LABEL:  callee3:
+
+; FP64-NOODDSPREG:        addiu   $sp, $sp, -[[OFFSET:[0-9]+]]
+
+; Check that first 10 arguments are received in even float registers
+; f0, f2, ... , f18. Check that 11th argument is received on stack.
+
+; FP64-NOODDSPREG-DAG:    lw      $[[R0:[0-9]+]], %got(da)(${{[0-9]+|gp}})
+; FP64-NOODDSPREG-DAG:    sdc1    $f0, 0($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f2, 8($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f4, 16($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f6, 24($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f8, 32($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f10, 40($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f12, 48($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f14, 56($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f16, 64($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f18, 72($[[R0]])
+
+; FP64-NOODDSPREG-DAG:    ldc1    $[[F0:f[0-9]*[02468]]], [[OFFSET]]($sp)
+; FP64-NOODDSPREG-DAG:    sdc1    $[[F0]], 80($[[R0]])
+
+  store double %a0, double* getelementptr ([11 x double]* @da, i32 0, i32 0), align 8
+  store double %a1, double* getelementptr ([11 x double]* @da, i32 0, i32 1), align 8
+  store double %a2, double* getelementptr ([11 x double]* @da, i32 0, i32 2), align 8
+  store double %a3, double* getelementptr ([11 x double]* @da, i32 0, i32 3), align 8
+  store double %a4, double* getelementptr ([11 x double]* @da, i32 0, i32 4), align 8
+  store double %a5, double* getelementptr ([11 x double]* @da, i32 0, i32 5), align 8
+  store double %a6, double* getelementptr ([11 x double]* @da, i32 0, i32 6), align 8
+  store double %a7, double* getelementptr ([11 x double]* @da, i32 0, i32 7), align 8
+  store double %a8, double* getelementptr ([11 x double]* @da, i32 0, i32 8), align 8
+  store double %a9, double* getelementptr ([11 x double]* @da, i32 0, i32 9), align 8
+  store double %a10, double* getelementptr ([11 x double]* @da, i32 0, i32 10), align 8
+  ret void
+}
diff --git a/test/CodeGen/Mips/fp16instrinsmc.ll b/test/CodeGen/Mips/fp16instrinsmc.ll
index 7ced36c..84d3814 100644
--- a/test/CodeGen/Mips/fp16instrinsmc.ll
+++ b/test/CodeGen/Mips/fp16instrinsmc.ll
@@ -385,7 +385,7 @@ entry:
 ; Function Attrs: nounwind
 declare double @exp2(double) #0
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { nounwind }
diff --git a/test/CodeGen/Mips/fp64a.ll b/test/CodeGen/Mips/fp64a.ll
new file mode 100644
index 0000000..fadce5c
--- /dev/null
+++ b/test/CodeGen/Mips/fp64a.ll
@@ -0,0 +1,161 @@
+; Test that the FP64A ABI performs double precision moves via a spill/reload.
+; The requirement is really that odd-numbered double precision registers do not
+; use mfc1/mtc1 to move the bottom 32-bits (because the hardware will redirect
+; this to the top 32-bits of the even register) but we have to make the decision
+; before register allocation so we do this for all double-precision values.
+
+; We don't test MIPS32r1 since support for 64-bit coprocessors (such as a 64-bit
+; FPU) on a 32-bit architecture was added in MIPS32r2.
+; FIXME: We currently don't test that attempting to use FP64 on MIPS32r1 is an
+;        error either. This is because a large number of CodeGen tests are
+;        incorrectly using this case. We should fix those test cases then add
+;        this check here.
+
+; RUN: llc -march=mips -mcpu=mips32r2 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-NO-FP64A-BE
+; RUN: llc -march=mips -mcpu=mips32r2 -mattr=fp64,nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-FP64A
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-NO-FP64A-LE
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=fp64,nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-FP64A
+
+; RUN: llc -march=mips64 -mcpu=mips64 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-NO-FP64A
+; RUN: not llc -march=mips64 -mcpu=mips64 -mattr=fp64,nooddspreg < %s 2>&1 | FileCheck %s -check-prefix=64-FP64A
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-NO-FP64A
+; RUN: not llc -march=mips64el -mcpu=mips64 -mattr=fp64,nooddspreg < %s 2>&1 | FileCheck %s -check-prefix=64-FP64A
+
+; 64-FP64A: LLVM ERROR: -mattr=+nooddspreg requires the O32 ABI.
+
+declare double @dbl();
+
+define double @call1(double %d, ...) {
+  ret double %d
+
+; ALL-LABEL:            call1:
+
+; 32R2-NO-FP64A-LE-NOT:     addiu   $sp, $sp
+; 32R2-NO-FP64A-LE:         mtc1    $4, $f0
+; 32R2-NO-FP64A-LE:         mthc1   $5, $f0
+
+; 32R2-NO-FP64A-BE-NOT:     addiu   $sp, $sp
+; 32R2-NO-FP64A-BE:         mtc1    $5, $f0
+; 32R2-NO-FP64A-BE:         mthc1   $4, $f0
+
+; 32R2-FP64A:               addiu   $sp, $sp, -8
+; 32R2-FP64A:               sw      $4, 0($sp)
+; 32R2-FP64A:               sw      $5, 4($sp)
+; 32R2-FP64A:               ldc1    $f0, 0($sp)
+
+; 64-NO-FP64A:              daddiu  $sp, $sp, -64
+; 64-NO-FP64A:              mov.d   $f0, $f12
+}
+
+define double @call2(i32 %i, double %d) {
+  ret double %d
+
+; ALL-LABEL:        call2:
+
+; 32R2-NO-FP64A-LE:     mtc1    $6, $f0
+; 32R2-NO-FP64A-LE:     mthc1   $7, $f0
+
+; 32R2-NO-FP64A-BE:     mtc1    $7, $f0
+; 32R2-NO-FP64A-BE:     mthc1   $6, $f0
+
+; 32R2-FP64A:           addiu   $sp, $sp, -8
+; 32R2-FP64A:           sw      $6, 0($sp)
+; 32R2-FP64A:           sw      $7, 4($sp)
+; 32R2-FP64A:           ldc1    $f0, 0($sp)
+
+; 64-NO-FP64A-NOT:      daddiu  $sp, $sp
+; 64-NO-FP64A:          mov.d   $f0, $f13
+}
+
+define double @call3(float %f1, float %f2, double %d) {
+  ret double %d
+
+; ALL-LABEL:        call3:
+
+; 32R2-NO-FP64A-LE:     mtc1    $6, $f0
+; 32R2-NO-FP64A-LE:     mthc1   $7, $f0
+
+; 32R2-NO-FP64A-BE:     mtc1    $7, $f0
+; 32R2-NO-FP64A-BE:     mthc1   $6, $f0
+
+; 32R2-FP64A:           addiu   $sp, $sp, -8
+; 32R2-FP64A:           sw      $6, 0($sp)
+; 32R2-FP64A:           sw      $7, 4($sp)
+; 32R2-FP64A:           ldc1    $f0, 0($sp)
+
+; 64-NO-FP64A-NOT:      daddiu  $sp, $sp
+; 64-NO-FP64A:          mov.d   $f0, $f14
+}
+
+define double @call4(float %f, double %d, ...) {
+  ret double %d
+
+; ALL-LABEL:        call4:
+
+; 32R2-NO-FP64A-LE:     mtc1    $6, $f0
+; 32R2-NO-FP64A-LE:     mthc1   $7, $f0
+
+; 32R2-NO-FP64A-BE:     mtc1    $7, $f0
+; 32R2-NO-FP64A-BE:     mthc1   $6, $f0
+
+; 32R2-FP64A:           addiu   $sp, $sp, -8
+; 32R2-FP64A:           sw      $6, 0($sp)
+; 32R2-FP64A:           sw      $7, 4($sp)
+; 32R2-FP64A:           ldc1    $f0, 0($sp)
+
+; 64-NO-FP64A:          daddiu  $sp, $sp, -48
+; 64-NO-FP64A:          mov.d   $f0, $f13
+}
+
+define double @call5(double %a, double %b, ...) {
+  %1 = fsub double %a, %b
+  ret double %1
+
+; ALL-LABEL:            call5:
+
+; 32R2-NO-FP64A-LE-DAG:     mtc1    $4, $[[T0:f[0-9]+]]
+; 32R2-NO-FP64A-LE-DAG:     mthc1   $5, $[[T0:f[0-9]+]]
+; 32R2-NO-FP64A-LE-DAG:     mtc1    $6, $[[T1:f[0-9]+]]
+; 32R2-NO-FP64A-LE-DAG:     mthc1   $7, $[[T1:f[0-9]+]]
+; 32R2-NO-FP64A-LE:         sub.d   $f0, $[[T0]], $[[T1]]
+
+; 32R2-NO-FP64A-BE-DAG:     mtc1    $5, $[[T0:f[0-9]+]]
+; 32R2-NO-FP64A-BE-DAG:     mthc1   $4, $[[T0:f[0-9]+]]
+; 32R2-NO-FP64A-BE-DAG:     mtc1    $7, $[[T1:f[0-9]+]]
+; 32R2-NO-FP64A-BE-DAG:     mthc1   $6, $[[T1:f[0-9]+]]
+; 32R2-NO-FP64A-BE:         sub.d   $f0, $[[T0]], $[[T1]]
+
+; 32R2-FP64A:               addiu   $sp, $sp, -8
+; 32R2-FP64A:               sw      $6, 0($sp)
+; 32R2-FP64A:               sw      $7, 4($sp)
+; 32R2-FP64A:               ldc1    $[[T1:f[0-9]+]], 0($sp)
+; 32R2-FP64A:               sw      $4, 0($sp)
+; 32R2-FP64A:               sw      $5, 4($sp)
+; 32R2-FP64A:               ldc1    $[[T0:f[0-9]+]], 0($sp)
+; 32R2-FP64A:               sub.d   $f0, $[[T0]], $[[T1]]
+
+; 64-NO-FP64A:              sub.d   $f0, $f12, $f13
+}
+
+define double @move_from(double %d) {
+  %1 = call double @dbl()
+  %2 = call double @call2(i32 0, double %1)
+  ret double %2
+
+; ALL-LABEL:        move_from:
+
+; 32R2-NO-FP64A-LE-DAG: mfc1    $6, $f0
+; 32R2-NO-FP64A-LE-DAG: mfhc1   $7, $f0
+
+; 32R2-NO-FP64A-BE-DAG: mfc1    $7, $f0
+; 32R2-NO-FP64A-BE-DAG: mfhc1   $6, $f0
+
+; 32R2-FP64A:           addiu   $sp, $sp, -32
+; 32R2-FP64A:           sdc1    $f0, 16($sp)
+; 32R2-FP64A:           lw      $6, 16($sp)
+; FIXME: This store is redundant
+; 32R2-FP64A:           sdc1    $f0, 16($sp)
+; 32R2-FP64A:           lw      $7, 20($sp)
+
+; 64-NO-FP64A:          mov.d   $f13, $f0
+}
diff --git a/test/CodeGen/Mips/fpxx.ll b/test/CodeGen/Mips/fpxx.ll
new file mode 100644
index 0000000..7e2ed22
--- /dev/null
+++ b/test/CodeGen/Mips/fpxx.ll
@@ -0,0 +1,221 @@
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-NOFPXX
+; RUN: llc -march=mipsel -mcpu=mips32 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-FPXX
+
+; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-NOFPXX
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-FPXX
+
+; RUN: llc -march=mips64 -mcpu=mips4 < %s | FileCheck %s -check-prefix=ALL -check-prefix=4-NOFPXX
+; RUN: not llc -march=mips64 -mcpu=mips4 -mattr=fpxx < %s 2>&1 | FileCheck %s -check-prefix=4-FPXX
+
+; RUN: llc -march=mips64 -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-NOFPXX
+; RUN: not llc -march=mips64 -mcpu=mips64 -mattr=fpxx < %s 2>&1 | FileCheck %s -check-prefix=64-FPXX
+
+; RUN-TODO: llc -march=mips64 -mcpu=mips4 -mattr=-n64,+o32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=4-O32-NOFPXX
+; RUN-TODO: llc -march=mips64 -mcpu=mips4 -mattr=-n64,+o32 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=4-O32-FPXX
+
+; RUN-TODO: llc -march=mips64 -mcpu=mips64 -mattr=-n64,+o32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-O32-NOFPXX
+; RUN-TODO: llc -march=mips64 -mcpu=mips64 -mattr=-n64,+o32 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-O32-FPXX
+
+declare double @dbl();
+
+; 4-FPXX:  LLVM ERROR: FPXX is not permitted for the N32/N64 ABI's.
+; 64-FPXX: LLVM ERROR: FPXX is not permitted for the N32/N64 ABI's.
+
+define double @test1(double %d, ...) {
+  ret double %d
+
+; ALL-LABEL: test1:
+
+; 32-NOFPXX:     mtc1    $4, $f0
+; 32-NOFPXX:     mtc1    $5, $f1
+
+; 32-FPXX:       addiu   $sp, $sp, -8
+; 32-FPXX:       sw      $4, 0($sp)
+; 32-FPXX:       sw      $5, 4($sp)
+; 32-FPXX:       ldc1    $f0, 0($sp)
+
+; 32R2-NOFPXX:   mtc1    $4, $f0
+; 32R2-NOFPXX:   mthc1   $5, $f0
+
+; 32R2-FPXX:     mtc1    $4, $f0
+; 32R2-FPXX:     mthc1   $5, $f0
+
+; floats/doubles are not passed in integer registers for n64, so dmtc1 is not used.
+; 4-NOFPXX:      mov.d   $f0, $f12
+
+; 64-NOFPXX:     mov.d   $f0, $f12
+}
+
+define double @test2(i32 %i, double %d) {
+  ret double %d
+
+; ALL-LABEL: test2:
+
+; 32-NOFPXX:     mtc1    $6, $f0
+; 32-NOFPXX:     mtc1    $7, $f1
+
+; 32-FPXX:       addiu   $sp, $sp, -8
+; 32-FPXX:       sw      $6, 0($sp)
+; 32-FPXX:       sw      $7, 4($sp)
+; 32-FPXX:       ldc1    $f0, 0($sp)
+
+; 32R2-NOFPXX:   mtc1    $6, $f0
+; 32R2-NOFPXX:   mthc1   $7, $f0
+
+; 32R2-FPXX:     mtc1    $6, $f0
+; 32R2-FPXX:     mthc1   $7, $f0
+
+; 4-NOFPXX:      mov.d   $f0, $f13
+
+; 64-NOFPXX:     mov.d   $f0, $f13
+}
+
+define double @test3(float %f1, float %f2, double %d) {
+  ret double %d
+
+; ALL-LABEL: test3:
+
+; 32-NOFPXX:     mtc1    $6, $f0
+; 32-NOFPXX:     mtc1    $7, $f1
+
+; 32-FPXX:       addiu   $sp, $sp, -8
+; 32-FPXX:       sw      $6, 0($sp)
+; 32-FPXX:       sw      $7, 4($sp)
+; 32-FPXX:       ldc1    $f0, 0($sp)
+
+; 32R2-NOFPXX:   mtc1    $6, $f0
+; 32R2-NOFPXX:   mthc1   $7, $f0
+
+; 32R2-FPXX:     mtc1    $6, $f0
+; 32R2-FPXX:     mthc1   $7, $f0
+
+; 4-NOFPXX:      mov.d   $f0, $f14
+
+; 64-NOFPXX:     mov.d   $f0, $f14
+}
+
+define double @test4(float %f, double %d, ...) {
+  ret double %d
+
+; ALL-LABEL: test4:
+
+; 32-NOFPXX:     mtc1    $6, $f0
+; 32-NOFPXX:     mtc1    $7, $f1
+
+; 32-FPXX:       addiu   $sp, $sp, -8
+; 32-FPXX:       sw      $6, 0($sp)
+; 32-FPXX:       sw      $7, 4($sp)
+; 32-FPXX:       ldc1    $f0, 0($sp)
+
+; 32R2-NOFPXX:   mtc1    $6, $f0
+; 32R2-NOFPXX:   mthc1   $7, $f0
+
+; 32R2-FPXX:     mtc1    $6, $f0
+; 32R2-FPXX:     mthc1   $7, $f0
+
+; 4-NOFPXX:      mov.d   $f0, $f13
+
+; 64-NOFPXX:     mov.d   $f0, $f13
+}
+
+define double @test5() {
+  ret double 0.000000e+00
+
+; ALL-LABEL: test5:
+
+; 32-NOFPXX:     mtc1    $zero, $f0
+; 32-NOFPXX:     mtc1    $zero, $f1
+
+; 32-FPXX:       addiu   $sp, $sp, -8
+; 32-FPXX:       sw      $zero, 0($sp)
+; 32-FPXX:       sw      $zero, 4($sp)
+; 32-FPXX:       ldc1    $f0, 0($sp)
+
+; 32R2-NOFPXX:   mtc1    $zero, $f0
+; 32R2-NOFPXX:   mthc1   $zero, $f0
+
+; 32R2-FPXX:     mtc1    $zero, $f0
+; 32R2-FPXX:     mthc1   $zero, $f0
+
+; 4-NOFPXX:      dmtc1 $zero, $f0
+
+; 64-NOFPXX:     dmtc1 $zero, $f0
+}
+
+define double @test6(double %a, double %b, ...) {
+  %1 = fsub double %a, %b
+  ret double %1
+
+; ALL-LABEL:     test6:
+
+; 32-NOFPXX-DAG:     mtc1    $4, $[[T0:f[0-9]+]]
+; 32-NOFPXX-DAG:     mtc1    $5, ${{f[0-9]*[13579]}}
+; 32-NOFPXX-DAG:     mtc1    $6, $[[T1:f[0-9]+]]
+; 32-NOFPXX-DAG:     mtc1    $7, ${{f[0-9]*[13579]}}
+; 32-NOFPXX:         sub.d   $f0, $[[T0]], $[[T1]]
+
+; 32-FPXX:           addiu   $sp, $sp, -8
+; 32-FPXX:           sw      $6, 0($sp)
+; 32-FPXX:           sw      $7, 4($sp)
+; 32-FPXX:           ldc1    $[[T1:f[0-9]+]], 0($sp)
+; 32-FPXX:           sw      $4, 0($sp)
+; 32-FPXX:           sw      $5, 4($sp)
+; 32-FPXX:           ldc1    $[[T0:f[0-9]+]], 0($sp)
+; 32-FPXX:           sub.d   $f0, $[[T0]], $[[T1]]
+
+; 32R2-NOFPXX-DAG:   mtc1    $4, $[[T0:f[0-9]+]]
+; 32R2-NOFPXX-DAG:   mthc1   $5, $[[T0]]
+; 32R2-NOFPXX-DAG:   mtc1    $6, $[[T1:f[0-9]+]]
+; 32R2-NOFPXX-DAG:   mthc1   $7, $[[T1]]
+; 32R2-NOFPXX:       sub.d   $f0, $[[T0]], $[[T1]]
+
+; 32R2-FPXX-DAG:     mtc1    $4, $[[T0:f[0-9]+]]
+; 32R2-FPXX-DAG:     mthc1   $5, $[[T0]]
+; 32R2-FPXX-DAG:     mtc1    $6, $[[T1:f[0-9]+]]
+; 32R2-FPXX-DAG:     mthc1   $7, $[[T1]]
+; 32R2-FPXX:         sub.d   $f0, $[[T0]], $[[T1]]
+
+; floats/doubles are not passed in integer registers for n64, so dmtc1 is not used.
+; 4-NOFPXX:          sub.d   $f0, $f12, $f13
+
+; floats/doubles are not passed in integer registers for n64, so dmtc1 is not used.
+; 64-NOFPXX:         sub.d   $f0, $f12, $f13
+}
+
+define double @move_from1(double %d) {
+  %1 = call double @dbl()
+  %2 = call double @test2(i32 0, double %1)
+  ret double %2
+
+; ALL-LABEL:   move_from1:
+
+; 32-NOFPXX-DAG:   mfc1    $6, $f0
+; 32-NOFPXX-DAG:   mfc1    $7, $f1
+
+; 32-FPXX:         addiu   $sp, $sp, -32
+; 32-FPXX:         sdc1    $f0, 16($sp)
+; 32-FPXX:         lw      $6, 16($sp)
+; FIXME: This store is redundant
+; 32-FPXX:         sdc1    $f0, 16($sp)
+; 32-FPXX:         lw      $7, 20($sp)
+
+; 32R2-NOFPXX-DAG: mfc1    $6, $f0
+; 32R2-NOFPXX-DAG: mfhc1   $7, $f0
+
+; 32R2-FPXX-DAG:   mfc1    $6, $f0
+; 32R2-FPXX-DAG:   mfhc1   $7, $f0
+
+; floats/doubles are not passed in integer registers for n64, so dmfc1 is not used.
+; We can't use inline assembly to force a copy either because trying to force
+; a copy to a GPR this way fails with ; "couldn't allocate input reg for
+; constraint 'r'". It therefore seems impossible to test the generation of dmfc1
+; in a simple test.
+; 4-NOFPXX:        mov.d   $f13, $f0
+
+; floats/doubles are not passed in integer registers for n64, so dmfc1 is not used.
+; We can't use inline assembly to force a copy either because trying to force
+; a copy to a GPR this way fails with ; "couldn't allocate input reg for
+; constraint 'r'". It therefore seems impossible to test the generation of dmfc1
+; in a simple test.
+; 64-NOFPXX:       mov.d   $f13, $f0
+}
diff --git a/test/CodeGen/Mips/gpreg-lazy-binding.ll b/test/CodeGen/Mips/gpreg-lazy-binding.ll
index 88e596b..3a636d8 100644
--- a/test/CodeGen/Mips/gpreg-lazy-binding.ll
+++ b/test/CodeGen/Mips/gpreg-lazy-binding.ll
@@ -25,3 +25,11 @@ entry:
   ret void
 }
 
+define void @no_lazy(void (i32)* %pf) {
+
+; CHECK-LABEL:  no_lazy
+; CHECK-NOT:    gp_disp
+
+  tail call void %pf(i32 1)
+  ret void
+}
diff --git a/test/CodeGen/Mips/hfptrcall.ll b/test/CodeGen/Mips/hfptrcall.ll
index 9df8d90..683952d 100644
--- a/test/CodeGen/Mips/hfptrcall.ll
+++ b/test/CodeGen/Mips/hfptrcall.ll
@@ -118,8 +118,8 @@ entry:
 
 declare i32 @printf(i8*, ...) #1
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="true" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 
 
diff --git a/test/CodeGen/Mips/init-array.ll b/test/CodeGen/Mips/init-array.ll
index f96ce26..1ca182d 100644
--- a/test/CodeGen/Mips/init-array.ll
+++ b/test/CodeGen/Mips/init-array.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple mipsel-unknown-linux -use-init-array < %s | FileCheck  %s
+; RUN: llc -mtriple mipsel-unknown-linux < %s | FileCheck  %s
 
 target triple = "mipsel-unknown-linux"
 
diff --git a/test/CodeGen/Mips/inlineasm-operand-code.ll b/test/CodeGen/Mips/inlineasm-operand-code.ll
index 6512851..3d9dec7 100644
--- a/test/CodeGen/Mips/inlineasm-operand-code.ll
+++ b/test/CodeGen/Mips/inlineasm-operand-code.ll
@@ -65,6 +65,33 @@ entry:
 ;CHECK_LITTLE_32:    addiu ${{[0-9]+}},${{[0-9]+}},$0
 ;CHECK_LITTLE_32:    #NO_APP
   tail call i32 asm sideeffect "addiu $0,$1,${2:z}", "=r,r,I"(i32 7, i32 0) nounwind
+
+; z with non-zero and the "r"(register) and "J"(integer zero) constraints
+;CHECK_LITTLE_32:    #APP
+;CHECK_LITTLE_32:    mtc0 ${{[1-9][0-9]?}}, ${{[0-9]+}}
+;CHECK_LITTLE_32:    #NO_APP
+  call void asm sideeffect "mtc0 ${0:z}, $$12", "Jr"(i32 7) nounwind
+
+; z with zero and the "r"(register) and "J"(integer zero) constraints
+;CHECK_LITTLE_32:    #APP
+;CHECK_LITTLE_32:    mtc0 $0, ${{[0-9]+}}
+;CHECK_LITTLE_32:    #NO_APP
+  call void asm sideeffect "mtc0 ${0:z}, $$12", "Jr"(i32 0) nounwind
+
+; z with non-zero and just the "r"(register) constraint
+;CHECK_LITTLE_32:    #APP
+;CHECK_LITTLE_32:    mtc0 ${{[1-9][0-9]?}}, ${{[0-9]+}}
+;CHECK_LITTLE_32:    #NO_APP
+  call void asm sideeffect "mtc0 ${0:z}, $$12", "r"(i32 7) nounwind
+
+; z with zero and just the "r"(register) constraint
+; FIXME: Check for $0, instead of other registers.
+;        We should be using $0 directly in this case, not real registers.
+;        When the materialization of 0 gets fixed, this test will fail.
+;CHECK_LITTLE_32:    #APP
+;CHECK_LITTLE_32:    mtc0 ${{[1-9][0-9]?}}, ${{[0-9]+}}
+;CHECK_LITTLE_32:    #NO_APP
+  call void asm sideeffect "mtc0 ${0:z}, $$12", "r"(i32 0) nounwind
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/llvm-ir/mul.ll b/test/CodeGen/Mips/llvm-ir/mul.ll
new file mode 100644
index 0000000..1674124
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/mul.ll
@@ -0,0 +1,181 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=M2
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=32R1-R2 -check-prefix=32R1
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=32R1-R2 -check-prefix=32R2
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=32R6
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=M4
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=64R1-R2
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=64R1-R2
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN:     -check-prefix=ALL -check-prefix=64R6
+
+define signext i1 @mul_i1(i1 signext %a, i1 signext %b) {
+entry:
+; ALL-LABEL: mul_i1:
+
+  ; M2:         mult    $4, $5
+  ; M2:         mflo    $[[T0:[0-9]+]]
+  ; M2:         sll     $[[T0]], $[[T0]], 31
+  ; M2:         sra     $2, $[[T0]], 31
+
+  ; 32R1-R2:    mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R1-R2:    sll     $[[T0]], $[[T0]], 31
+  ; 32R1-R2:    sra     $2, $[[T0]], 31
+
+  ; 32R6:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R6:       sll     $[[T0]], $[[T0]], 31
+  ; 32R6:       sra     $2, $[[T0]], 31
+
+  ; M4:         mult    $4, $5
+  ; M4:         mflo    $[[T0:[0-9]+]]
+  ; M4:         sll     $[[T0]], $[[T0]], 31
+  ; M4:         sra     $2, $[[T0]], 31
+
+  ; 64R1-R2:    mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R1-R2:    sll     $[[T0]], $[[T0]], 31
+  ; 64R1-R2:    sra     $2, $[[T0]], 31
+
+  ; 64R6:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R6:       sll     $[[T0]], $[[T0]], 31
+  ; 64R6:       sra     $2, $[[T0]], 31
+
+  %r = mul i1 %a, %b
+  ret i1 %r
+}
+
+define signext i8 @mul_i8(i8 signext %a, i8 signext %b) {
+entry:
+; ALL-LABEL: mul_i8:
+
+  ; M2:         mult    $4, $5
+  ; M2:         mflo    $[[T0:[0-9]+]]
+  ; M2:         sll     $[[T0]], $[[T0]], 24
+  ; M2:         sra     $2, $[[T0]], 24
+
+  ; 32R1:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R1:       sll     $[[T0]], $[[T0]], 24
+  ; 32R1:       sra     $2, $[[T0]], 24
+
+  ; 32R2:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R2:       seb     $2, $[[T0]]
+
+  ; 32R6:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R6:       seb     $2, $[[T0]]
+
+  ; M4:         mult    $4, $5
+  ; M4:         mflo    $[[T0:[0-9]+]]
+  ; M4:         sll     $[[T0]], $[[T0]], 24
+  ; M4:         sra     $2, $[[T0]], 24
+
+  ; 64R1:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R1:       sll     $[[T0]], $[[T0]], 24
+  ; 64R1:       sra     $2, $[[T0]], 24
+
+  ; 64R2:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R2:       seb     $2, $[[T0]]
+
+  ; 64R6:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R6:       seb     $2, $[[T0]]
+  %r = mul i8 %a, %b
+  ret i8 %r
+}
+
+define signext i16 @mul_i16(i16 signext %a, i16 signext %b) {
+entry:
+; ALL-LABEL: mul_i16:
+
+  ; M2:         mult    $4, $5
+  ; M2:         mflo    $[[T0:[0-9]+]]
+  ; M2:         sll     $[[T0]], $[[T0]], 16
+  ; M2:         sra     $2, $[[T0]], 16
+
+  ; 32R1:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R1:       sll     $[[T0]], $[[T0]], 16
+  ; 32R1:       sra     $2, $[[T0]], 16
+
+  ; 32R2:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R2:       seh     $2, $[[T0]]
+
+  ; 32R6:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R6:       seh     $2, $[[T0]]
+
+  ; M4:         mult    $4, $5
+  ; M4:         mflo    $[[T0:[0-9]+]]
+  ; M4:         sll     $[[T0]], $[[T0]], 16
+  ; M4:         sra     $2, $[[T0]], 16
+
+  ; 64R1:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R1:       sll     $[[T0]], $[[T0]], 16
+  ; 64R1:       sra     $2, $[[T0]], 16
+
+  ; 64R2:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R2:       seh     $2, $[[T0]]
+
+  ; 64R6:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R6:       seh     $2, $[[T0]]
+  %r = mul i16 %a, %b
+  ret i16 %r
+}
+
+define signext i32 @mul_i32(i32 signext %a, i32 signext %b) {
+entry:
+; ALL-LABEL: mul_i32:
+
+  ; M2:         mult    $4, $5
+  ; M2:         mflo    $2
+
+  ; 32R1-R2:    mul     $2, $4, $5
+  ; 32R6:       mul     $2, $4, $5
+
+  ; 64R1-R2:    mul     $2, $4, $5
+  ; 64R6:       mul     $2, $4, $5
+  %r = mul i32 %a, %b
+  ret i32 %r
+}
+
+define signext i64 @mul_i64(i64 signext %a, i64 signext %b) {
+entry:
+; ALL-LABEL: mul_i64:
+
+  ; M2:         mult    $4, $7
+  ; M2:         mflo    $[[T0:[0-9]+]]
+  ; M2:         mult    $5, $6
+  ; M2:         mflo    $[[T1:[0-9]+]]
+  ; M2:         multu   $5, $7
+  ; M2:         mflo    $3
+  ; M2:         mfhi    $4
+  ; M2:         addu    $[[T2:[0-9]+]], $4, $[[T1]]
+  ; M2:         addu    $2, $[[T2]], $[[T0]]
+
+  ; 32R1-R2:    multu   $5, $7
+  ; 32R1-R2:    mflo    $3
+  ; 32R1-R2:    mfhi    $[[T0:[0-9]+]]
+  ; 32R1-R2:    mul     $[[T1:[0-9]+]], $4, $7
+  ; 32R1-R2:    mul     $[[T2:[0-9]+]], $5, $6
+  ; 32R1-R2:    addu    $[[T0]], $[[T0]], $[[T2:[0-9]+]]
+  ; 32R1-R2:    addu    $2, $[[T0]], $[[T1]]
+
+  ; 32R6:       mul     $[[T0:[0-9]+]], $5, $6
+  ; 32R6:       muhu    $[[T1:[0-9]+]], $5, $7
+  ; 32R6:       addu    $[[T0]], $[[T1]], $[[T0]]
+  ; 32R6:       mul     $[[T2:[0-9]+]], $4, $7
+  ; 32R6:       addu    $2, $[[T0]], $[[T2]]
+  ; 32R6:       mul     $3, $5, $7
+
+  ; M4:         dmult   $4, $5
+  ; M4:         mflo    $2
+
+  ; 64R1-R2:    dmult   $4, $5
+  ; 64R1-R2:    mflo    $2
+
+  ; 64R6:       dmul    $2, $4, $5
+
+  %r = mul i64 %a, %b
+  ret i64 %r
+}
diff --git a/test/CodeGen/Mips/load-store-left-right.ll b/test/CodeGen/Mips/load-store-left-right.ll
index a3f5ebf..f6d0e8d 100644
--- a/test/CodeGen/Mips/load-store-left-right.ll
+++ b/test/CodeGen/Mips/load-store-left-right.ll
@@ -47,7 +47,7 @@ entry:
   ret i32 %0
 }
 
-define void @store_SI(i32 %a) nounwind {
+define void @store_SI(i32 signext %a) nounwind {
 entry:
 ; ALL-LABEL: store_SI:
 
@@ -201,7 +201,7 @@ entry:
   ret void
 }
 
-define void @store_SI_trunc_from_i64(i32 %a) nounwind {
+define void @store_SI_trunc_from_i64(i32 signext %a) nounwind {
 entry:
 ; ALL-LABEL: store_SI_trunc_from_i64:
 
diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll
index a403744..b9b52be 100644
--- a/test/CodeGen/Mips/longbranch.ll
+++ b/test/CodeGen/Mips/longbranch.ll
@@ -13,7 +13,7 @@
 
 @x = external global i32
 
-define void @test1(i32 %s) {
+define void @test1(i32 signext %s) {
 entry:
   %cmp = icmp eq i32 %s, 0
   br i1 %cmp, label %end, label %then
diff --git a/test/CodeGen/Mips/madd-msub.ll b/test/CodeGen/Mips/madd-msub.ll
index 8222967..b0c3ff6 100644
--- a/test/CodeGen/Mips/madd-msub.ll
+++ b/test/CodeGen/Mips/madd-msub.ll
@@ -76,26 +76,14 @@ entry:
 ; 32R6-DAG:      muhu $[[T3:[0-9]+]], ${{[45]}}, ${{[45]}}
 ; 32R6-DAG:      addu $2, $[[T3]], $[[T2]]
 
-; 64-DAG:        dsll $[[T0:[0-9]+]], $4, 32
-; 64-DAG:        dsrl $[[T1:[0-9]+]], $[[T0]], 32
-; 64-DAG:        dsll $[[T2:[0-9]+]], $5, 32
-; 64-DAG:        dsrl $[[T3:[0-9]+]], $[[T2]], 32
-; 64-DAG:        d[[m:m]]ult $[[T3]], $[[T1]]
-; 64-DAG:        [[m]]flo $[[T4:[0-9]+]]
-; 64-DAG:        dsll $[[T5:[0-9]+]], $6, 32
-; 64-DAG:        dsrl $[[T6:[0-9]+]], $[[T5]], 32
-; 64-DAG:        daddu $2, $[[T4]], $[[T6]]
-
-; 64R6-DAG:      dsll $[[T0:[0-9]+]], $4, 32
-; 64R6-DAG:      dsrl $[[T1:[0-9]+]], $[[T0]], 32
-; 64R6-DAG:      dsll $[[T2:[0-9]+]], $5, 32
-; 64R6-DAG:      dsrl $[[T3:[0-9]+]], $[[T2]], 32
-; 64R6-DAG:      dmul $[[T4:[0-9]+]], $[[T3]], $[[T1]]
-; 64R6-DAG:      dsll $[[T5:[0-9]+]], $6, 32
-; 64R6-DAG:      dsrl $[[T6:[0-9]+]], $[[T5]], 32
-; 64R6-DAG:      daddu $2, $[[T4]], $[[T6]]
-
-define i64 @madd2(i32 %a, i32 %b, i32 %c) nounwind readnone {
+; 64-DAG:        d[[m:m]]ult $5, $4
+; 64-DAG:        [[m]]flo $[[T0:[0-9]+]]
+; 64-DAG:        daddu $2, $[[T0]], $6
+
+; 64R6-DAG:      dmul $[[T0:[0-9]+]], $5, $4
+; 64R6-DAG:      daddu $2, $[[T0]], $6
+
+define i64 @madd2(i32 zeroext %a, i32 zeroext %b, i32 zeroext %c) nounwind readnone {
 entry:
   %conv = zext i32 %a to i64
   %conv2 = zext i32 %b to i64
@@ -214,26 +202,14 @@ entry:
 ; 32R6-DAG:      negu $2, $[[T3]]
 ; 32R6-DAG:      subu $3, $6, $[[T1]]
 
-; 64-DAG:        dsll $[[T0:[0-9]+]], $4, 32
-; 64-DAG:        dsrl $[[T1:[0-9]+]], $[[T0]], 32
-; 64-DAG:        dsll $[[T2:[0-9]+]], $5, 32
-; 64-DAG:        dsrl $[[T3:[0-9]+]], $[[T2]], 32
-; 64-DAG:        d[[m:m]]ult $[[T3]], $[[T1]]
-; 64-DAG:        [[m]]flo $[[T4:[0-9]+]]
-; 64-DAG:        dsll $[[T5:[0-9]+]], $6, 32
-; 64-DAG:        dsrl $[[T6:[0-9]+]], $[[T5]], 32
-; 64-DAG:        dsubu $2, $[[T6]], $[[T4]]
-
-; 64R6-DAG:      dsll $[[T0:[0-9]+]], $4, 32
-; 64R6-DAG:      dsrl $[[T1:[0-9]+]], $[[T0]], 32
-; 64R6-DAG:      dsll $[[T2:[0-9]+]], $5, 32
-; 64R6-DAG:      dsrl $[[T3:[0-9]+]], $[[T2]], 32
-; 64R6-DAG:      dmul $[[T4:[0-9]+]], $[[T3]], $[[T1]]
-; 64R6-DAG:      dsll $[[T5:[0-9]+]], $6, 32
-; 64R6-DAG:      dsrl $[[T6:[0-9]+]], $[[T5]], 32
-; 64R6-DAG:      dsubu $2, $[[T6]], $[[T4]]
-
-define i64 @msub2(i32 %a, i32 %b, i32 %c) nounwind readnone {
+; 64-DAG:        d[[m:m]]ult $5, $4
+; 64-DAG:        [[m]]flo $[[T0:[0-9]+]]
+; 64-DAG:        dsubu $2, $6, $[[T0]]
+
+; 64R6-DAG:      dmul $[[T0:[0-9]+]], $5, $4
+; 64R6-DAG:      dsubu $2, $6, $[[T0]]
+
+define i64 @msub2(i32 zeroext %a, i32 zeroext %b, i32 zeroext %c) nounwind readnone {
 entry:
   %conv = zext i32 %c to i64
   %conv2 = zext i32 %a to i64
diff --git a/test/CodeGen/Mips/micromips-addiu.ll b/test/CodeGen/Mips/micromips-addiu.ll
new file mode 100644
index 0000000..c5bee34
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-addiu.ll
@@ -0,0 +1,32 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
+; RUN:   -relocation-model=pic -O3 < %s | FileCheck %s
+
+@x = global i32 65504, align 4
+@y = global i32 60929, align 4
+@z = global i32 60929, align 4
+@.str = private unnamed_addr constant [7 x i8] c"%08x \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+  %0 = load i32* @x, align 4
+  %addiu1 = add i32 %0, -7
+  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds
+                                  ([7 x i8]* @.str, i32 0, i32 0), i32 %addiu1)
+
+  %1 = load i32* @y, align 4
+  %addiu2 = add i32 %1, 55
+  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds
+                                  ([7 x i8]* @.str, i32 0, i32 0), i32 %addiu2)
+
+  %2 = load i32* @z, align 4
+  %addiu3 = add i32 %2, 24
+  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds
+                                  ([7 x i8]* @.str, i32 0, i32 0), i32 %addiu3)
+  ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
+
+; CHECK: addius5  ${{[0-9]+}}, -7
+; CHECK: addiu    ${{[0-9]+}}, ${{[0-9]+}}, 55
+; CHECK: addiur2  ${{[2-7]|16|17}}, ${{[2-7]|16|17}}, 24
diff --git a/test/CodeGen/Mips/micromips-andi.ll b/test/CodeGen/Mips/micromips-andi.ll
new file mode 100644
index 0000000..b82d2b0
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-andi.ll
@@ -0,0 +1,25 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
+; RUN:   -relocation-model=pic -O3 < %s | FileCheck %s
+
+@x = global i32 65504, align 4
+@y = global i32 60929, align 4
+@.str = private unnamed_addr constant [7 x i8] c"%08x \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+  %0 = load i32* @x, align 4
+  %and1 = and i32 %0, 4
+  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds
+                                  ([7 x i8]* @.str, i32 0, i32 0), i32 %and1)
+
+  %1 = load i32* @y, align 4
+  %and2 = and i32 %1, 5
+  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds
+                                  ([7 x i8]* @.str, i32 0, i32 0), i32 %and2)
+  ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
+
+; CHECK: andi16 ${{[2-7]|16|17}}, ${{[2-7]|16|17}}
+; CHECK: andi   ${{[0-9]+}}, ${{[0-9]+}}
diff --git a/test/CodeGen/Mips/micromips-delay-slot.ll b/test/CodeGen/Mips/micromips-delay-slot.ll
new file mode 100644
index 0000000..4bab97a
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-delay-slot.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
+; RUN:   -relocation-model=pic -O3 < %s | FileCheck %s
+
+; Function Attrs: nounwind uwtable
+define i32 @foo(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  %0 = load i32* %a.addr, align 4
+  %shl = shl i32 %0, 2
+  %call = call i32 @bar(i32 %shl)
+  ret i32 %call
+}
+
+declare i32 @bar(i32) #1
+
+; CHECK: nop
+
diff --git a/test/CodeGen/Mips/micromips-rdhwr-directives.ll b/test/CodeGen/Mips/micromips-rdhwr-directives.ll
new file mode 100644
index 0000000..af40a87
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-rdhwr-directives.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=mipsel -mcpu=mips32 -relocation-model=static < %s \
+; RUN:   -mattr=+micromips | FileCheck %s
+
+@a = external thread_local global i32
+
+define i32 @foo() nounwind readonly {
+entry:
+; CHECK: .set  push
+; CHECK: .set  mips32r2
+; CHECK: rdhwr
+; CHECK: .set  pop
+
+  %0 = load i32* @a, align 4
+  ret i32 %0
+}
diff --git a/test/CodeGen/Mips/micromips-shift.ll b/test/CodeGen/Mips/micromips-shift.ll
new file mode 100644
index 0000000..8215010
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-shift.ll
@@ -0,0 +1,44 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
+; RUN:   -relocation-model=pic -O3 < %s | FileCheck %s
+
+@a = global i32 10, align 4
+@b = global i32 0, align 4
+@c = global i32 10, align 4
+@d = global i32 0, align 4
+
+define i32 @shift_left() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %shl = shl i32 %0, 4
+  store i32 %shl, i32* @b, align 4
+
+  %1 = load i32* @c, align 4
+  %shl1 = shl i32 %1, 10
+  store i32 %shl1, i32* @d, align 4
+
+  ret i32 0
+}
+
+; CHECK: sll16  ${{[2-7]|16|17}}, ${{[2-7]|16|17}}, {{[0-7]}}
+; CHECK: sll    ${{[0-9]+}}, ${{[0-9]+}}, {{[0-9]+}}
+
+@i = global i32 10654, align 4
+@j = global i32 0, align 4
+@m = global i32 10, align 4
+@n = global i32 0, align 4
+
+define i32 @shift_right() nounwind {
+entry:
+  %0 = load i32* @i, align 4
+  %shr = lshr i32 %0, 4
+  store i32 %shr, i32* @j, align 4
+
+  %1 = load i32* @m, align 4
+  %shr1 = lshr i32 %1, 10
+  store i32 %shr1, i32* @n, align 4
+
+  ret i32 0
+}
+
+; CHECK: srl16  ${{[2-7]|16|17}}, ${{[2-7]|16|17}}, {{[0-7]}}
+; CHECK: srl    ${{[0-9]+}}, ${{[0-9]+}}, {{[0-9]+}}
diff --git a/test/CodeGen/Mips/mips16-hf-attr-2.ll b/test/CodeGen/Mips/mips16-hf-attr-2.ll
new file mode 100644
index 0000000..60c6eaa
--- /dev/null
+++ b/test/CodeGen/Mips/mips16-hf-attr-2.ll
@@ -0,0 +1,45 @@
+; Check that stubs generation for mips16 hard-float mode does not depend
+; on the function 'use-soft-float' attribute's value.
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel \
+; RUN:     -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s
+
+define void @bar_sf() #1 {
+; CHECK: bar_sf:
+entry:
+  %call1 = call float @foo(float 1.000000e+00)
+; CHECK: lw $3, %call16(foo)($2)
+; CHECK-NOT: lw $5, %got(__mips16_call_stub_sf_1)($3)
+  ret void
+}
+
+define void @bar_hf() #0 {
+; CHECK: bar_hf:
+entry:
+  %call1 = call float @foo(float 1.000000e+00)
+; CHECK: lw $2, %call16(foo)($3)
+; CHECK: lw $5, %got(__mips16_call_stub_sf_1)($3)
+  ret void
+}
+
+declare float @foo(float) #2
+
+attributes #0 = {
+  nounwind
+  "less-precise-fpmad"="false" "no-frame-pointer-elim"="true"
+  "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false"
+  "no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
+  "unsafe-fp-math"="false" "use-soft-float"="false"
+}
+attributes #1 = {
+  nounwind
+  "less-precise-fpmad"="false" "no-frame-pointer-elim"="true"
+  "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false"
+  "no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
+  "unsafe-fp-math"="false" "use-soft-float"="true"
+}
+attributes #2 = {
+  "less-precise-fpmad"="false" "no-frame-pointer-elim"="true"
+  "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false"
+  "no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
+  "unsafe-fp-math"="false" "use-soft-float"="true"
+}
diff --git a/test/CodeGen/Mips/mips16-hf-attr.ll b/test/CodeGen/Mips/mips16-hf-attr.ll
index d9ad629..c6ad442 100644
--- a/test/CodeGen/Mips/mips16-hf-attr.ll
+++ b/test/CodeGen/Mips/mips16-hf-attr.ll
@@ -3,8 +3,8 @@
 ; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel \
 ; RUN:     -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s
 
-define void @bar_sf() #0 {
-; CHECK: bar_sf:
+define void @bar_hf() #0 {
+; CHECK: bar_hf:
 entry:
   %call1 = call float @foo(float 1.000000e+00)
 ; CHECK: lw $2, %call16(foo)($3)
@@ -12,12 +12,12 @@ entry:
   ret void
 }
 
-define void @bar_hf() #1 {
-; CHECK: bar_hf:
+define void @bar_sf() #1 {
+; CHECK: bar_sf:
 entry:
   %call1 = call float @foo(float 1.000000e+00)
-; CHECK: lw $2, %call16(foo)($3)
-; CHECK: lw $5, %got(__mips16_call_stub_sf_1)($3)
+; CHECK: lw $3, %call16(foo)($2)
+; CHECK-NOT: lw $5, %got(__mips16_call_stub_sf_1)($3)
   ret void
 }
 
diff --git a/test/CodeGen/Mips/mips64-f128.ll b/test/CodeGen/Mips/mips64-f128.ll
index 7f7d515..6987d4a 100644
--- a/test/CodeGen/Mips/mips64-f128.ll
+++ b/test/CodeGen/Mips/mips64-f128.ll
@@ -114,7 +114,7 @@ entry:
 ; ALL-LABEL: conv_LD_UInt:
 ; ALL: ld $25, %call16(__floatunsitf)
 
-define fp128 @conv_LD_UInt(i32 %a) {
+define fp128 @conv_LD_UInt(i32 signext %a) {
 entry:
   %conv = uitofp i32 %a to fp128
   ret fp128 %conv
@@ -545,7 +545,7 @@ entry:
 
 ; ALL-LABEL: load_LD_float:
 ; ALL: ld   $[[R0:[0-9]+]], %got_disp(gf1)
-; ALL: lw   $4, 0($[[R0]])
+; ALL: lwu  $4, 0($[[R0]])
 ; ALL: ld   $25, %call16(__extendsftf2)
 ; ALL: jalr $25
 
@@ -635,7 +635,7 @@ entry:
 ; CMP_CC_FMT-DAG: selnez $[[NE2:[0-9]+]], $7, $[[CC]]
 ; CMP_CC_FMT-DAG: or $4, $[[NE2]], $[[EQ2]]
 
-define fp128 @select_LD(i32 %a, i64, fp128 %b, fp128 %c) {
+define fp128 @select_LD(i32 signext %a, i64, fp128 %b, fp128 %c) {
 entry:
   %tobool = icmp ne i32 %a, 0
   %cond = select i1 %tobool, fp128 %b, fp128 %c
diff --git a/test/CodeGen/Mips/mips64-sret.ll b/test/CodeGen/Mips/mips64-sret.ll
index 7a52c3d..ed494e9 100644
--- a/test/CodeGen/Mips/mips64-sret.ll
+++ b/test/CodeGen/Mips/mips64-sret.ll
@@ -11,7 +11,7 @@ entry:
   ret void
 }
 
-define void @bar(i32 %v, i32* noalias sret %agg.result) nounwind {
+define void @bar(i32 signext %v, i32* noalias sret %agg.result) nounwind {
 entry:
 ; CHECK-LABEL: bar:
 ; CHECK: sw $4, 0($5)
diff --git a/test/CodeGen/Mips/mno-ldc1-sdc1.ll b/test/CodeGen/Mips/mno-ldc1-sdc1.ll
index 244b03d..db653ea 100644
--- a/test/CodeGen/Mips/mno-ldc1-sdc1.ll
+++ b/test/CodeGen/Mips/mno-ldc1-sdc1.ll
@@ -123,7 +123,7 @@ entry:
 ; 32R1-LE-PIC-DAG:    sw $[[R1]], 4(${{[0-9]+}})
 
 ; 32R2-LE-PIC-DAG:    mfc1 $[[R0:[0-9]+]], $f12
-; 32R2-LE-PIC-DAG:    mfc1 $[[R1:[0-9]+]], $f13
+; 32R2-LE-PIC-DAG:    mfhc1 $[[R1:[0-9]+]], $f12
 ; 32R2-LE-PIC-DAG:    sw $[[R0]], 0(${{[0-9]+}})
 ; 32R2-LE-PIC-DAG:    sw $[[R1]], 4(${{[0-9]+}})
 
@@ -140,7 +140,7 @@ entry:
 ; 32R1-LE-STATIC-DAG: sw $[[R1]], 4($[[R3]])
 
 ; 32R2-LE-STATIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
-; 32R2-LE-STATIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R2-LE-STATIC-DAG: mfhc1 $[[R1:[0-9]+]], $f12
 ; 32R2-LE-STATIC-DAG: lui $[[R2:[0-9]+]], %hi(g0)
 ; 32R2-LE-STATIC-DAG: sw $[[R0]], %lo(g0)($[[R2]])
 ; 32R2-LE-STATIC-DAG: addiu $[[R3:[0-9]+]], $[[R2]], %lo(g0)
@@ -159,7 +159,7 @@ entry:
 ; 32R1-BE-PIC-DAG:    sw $[[R0]], 4(${{[0-9]+}})
 
 ; 32R2-BE-PIC-DAG:    mfc1 $[[R0:[0-9]+]], $f12
-; 32R2-BE-PIC-DAG:    mfc1 $[[R1:[0-9]+]], $f13
+; 32R2-BE-PIC-DAG:    mfhc1 $[[R1:[0-9]+]], $f12
 ; 32R2-BE-PIC-DAG:    sw $[[R1]], 0(${{[0-9]+}})
 ; 32R2-BE-PIC-DAG:    sw $[[R0]], 4(${{[0-9]+}})
 
@@ -225,7 +225,7 @@ entry:
 ; 32R1-DAG:      sw $[[R1]], 4(${{[0-9]+}})
 
 ; 32R2-DAG:      mfc1 $[[R0:[0-9]+]], $f12
-; 32R2-DAG:      mfc1 $[[R1:[0-9]+]], $f13
+; 32R2-DAG:      mfhc1 $[[R1:[0-9]+]], $f12
 ; 32R2-DAG:      sw $[[R0]], 0(${{[0-9]+}})
 ; 32R2-DAG:      sw $[[R1]], 4(${{[0-9]+}})
 
diff --git a/test/CodeGen/Mips/msa/arithmetic_float.ll b/test/CodeGen/Mips/msa/arithmetic_float.ll
index 86e57ac..9aae284 100644
--- a/test/CodeGen/Mips/msa/arithmetic_float.ll
+++ b/test/CodeGen/Mips/msa/arithmetic_float.ll
@@ -276,8 +276,8 @@ define void @fexp2_v4f32_2(<4 x float>* %c, <4 x float>* %a) nounwind {
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
   %2 = tail call <4 x float> @llvm.exp2.v4f32 (<4 x float> %1)
   %3 = fmul <4 x float> <float 2.0, float 2.0, float 2.0, float 2.0>, %2
-  ; CHECK-DAG: lui [[R3:\$[0-9]+]], 16384
-  ; CHECK-DAG: fill.w [[R4:\$w[0-9]+]], [[R3]]
+  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
+  ; CHECK-DAG: ffint_u.w [[R4:\$w[0-9]+]], [[R3]]
   ; CHECK-DAG: fexp2.w [[R5:\$w[0-9]+]], [[R4]], [[R1]]
   store <4 x float> %3, <4 x float>* %c
   ; CHECK-DAG: st.w [[R5]], 0($4)
@@ -287,16 +287,14 @@ define void @fexp2_v4f32_2(<4 x float>* %c, <4 x float>* %a) nounwind {
 }
 
 define void @fexp2_v2f64_2(<2 x double>* %c, <2 x double>* %a) nounwind {
-  ; CHECK:      .8byte 4611686018427387904
-  ; CHECK-NEXT: .8byte 4611686018427387904
   ; CHECK: fexp2_v2f64_2:
 
   %1 = load <2 x double>* %a
   ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
   %2 = tail call <2 x double> @llvm.exp2.v2f64 (<2 x double> %1)
   %3 = fmul <2 x double> <double 2.0, double 2.0>, %2
-  ; CHECK-DAG: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[G_PTR]])
+  ; CHECK-DAG: ldi.d [[R2:\$w[0-9]+]], 1
+  ; CHECK-DAG: ffint_u.d [[R3:\$w[0-9]+]], [[R2]]
   ; CHECK-DAG: fexp2.d [[R4:\$w[0-9]+]], [[R3]], [[R1]]
   store <2 x double> %3, <2 x double>* %c
   ; CHECK-DAG: st.d [[R4]], 0($4)
diff --git a/test/CodeGen/Mips/msa/frameindex.ll b/test/CodeGen/Mips/msa/frameindex.ll
index 07e67bf..ebec465 100644
--- a/test/CodeGen/Mips/msa/frameindex.ll
+++ b/test/CodeGen/Mips/msa/frameindex.ll
@@ -36,10 +36,10 @@ define void @loadstore_v16i8_just_over_simm10() nounwind {
   %2 = alloca [497 x i8] ; Push the frame just over 512 bytes
 
   %3 = load volatile <16 x i8>* %1
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 512
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 512
   ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <16 x i8> %3, <16 x i8>* %1
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 512
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 512
   ; MIPS32-AE: st.b [[R1]], 0([[BASE]])
 
   ret void
@@ -53,12 +53,12 @@ define void @loadstore_v16i8_just_under_simm16() nounwind {
   %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
 
   %3 = load volatile <16 x i8>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <16 x i8> %3, <16 x i8>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: st.b [[R1]], 0([[BASE]])
 
   ret void
@@ -72,12 +72,12 @@ define void @loadstore_v16i8_just_over_simm16() nounwind {
   %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
 
   %3 = load volatile <16 x i8>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <16 x i8> %3, <16 x i8>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: st.b [[R1]], 0([[BASE]])
 
   ret void
@@ -107,10 +107,10 @@ define void @loadstore_v8i16_unaligned() nounwind {
   %5 = getelementptr [2 x <8 x i16>]* %4, i32 0, i32 0
 
   %6 = load volatile <8 x i16>* %5
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1
   ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <8 x i16> %6, <8 x i16>* %5
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1
   ; MIPS32-AE: st.h [[R1]], 0([[BASE]])
 
   ret void
@@ -139,10 +139,10 @@ define void @loadstore_v8i16_just_over_simm10() nounwind {
   %2 = alloca [1009 x i8] ; Push the frame just over 1024 bytes
 
   %3 = load volatile <8 x i16>* %1
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1024
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1024
   ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <8 x i16> %3, <8 x i16>* %1
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1024
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1024
   ; MIPS32-AE: st.h [[R1]], 0([[BASE]])
 
   ret void
@@ -156,12 +156,12 @@ define void @loadstore_v8i16_just_under_simm16() nounwind {
   %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
 
   %3 = load volatile <8 x i16>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <8 x i16> %3, <8 x i16>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: st.h [[R1]], 0([[BASE]])
 
   ret void
@@ -175,12 +175,12 @@ define void @loadstore_v8i16_just_over_simm16() nounwind {
   %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
 
   %3 = load volatile <8 x i16>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <8 x i16> %3, <8 x i16>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: st.h [[R1]], 0([[BASE]])
 
   ret void
@@ -210,10 +210,10 @@ define void @loadstore_v4i32_unaligned() nounwind {
   %5 = getelementptr [2 x <4 x i32>]* %4, i32 0, i32 0
 
   %6 = load volatile <4 x i32>* %5
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1
   ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <4 x i32> %6, <4 x i32>* %5
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1
   ; MIPS32-AE: st.w [[R1]], 0([[BASE]])
 
   ret void
@@ -242,10 +242,10 @@ define void @loadstore_v4i32_just_over_simm10() nounwind {
   %2 = alloca [2033 x i8] ; Push the frame just over 2048 bytes
 
   %3 = load volatile <4 x i32>* %1
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 2048
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 2048
   ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <4 x i32> %3, <4 x i32>* %1
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 2048
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 2048
   ; MIPS32-AE: st.w [[R1]], 0([[BASE]])
 
   ret void
@@ -259,12 +259,12 @@ define void @loadstore_v4i32_just_under_simm16() nounwind {
   %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
 
   %3 = load volatile <4 x i32>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <4 x i32> %3, <4 x i32>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: st.w [[R1]], 0([[BASE]])
 
   ret void
@@ -278,12 +278,12 @@ define void @loadstore_v4i32_just_over_simm16() nounwind {
   %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
 
   %3 = load volatile <4 x i32>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <4 x i32> %3, <4 x i32>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: st.w [[R1]], 0([[BASE]])
 
   ret void
@@ -313,10 +313,10 @@ define void @loadstore_v2i64_unaligned() nounwind {
   %5 = getelementptr [2 x <2 x i64>]* %4, i32 0, i32 0
 
   %6 = load volatile <2 x i64>* %5
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1
   ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <2 x i64> %6, <2 x i64>* %5
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1
   ; MIPS32-AE: st.d [[R1]], 0([[BASE]])
 
   ret void
@@ -345,10 +345,10 @@ define void @loadstore_v2i64_just_over_simm10() nounwind {
   %2 = alloca [4081 x i8] ; Push the frame just over 4096 bytes
 
   %3 = load volatile <2 x i64>* %1
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 4096
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 4096
   ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <2 x i64> %3, <2 x i64>* %1
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 4096
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 4096
   ; MIPS32-AE: st.d [[R1]], 0([[BASE]])
 
   ret void
@@ -362,12 +362,12 @@ define void @loadstore_v2i64_just_under_simm16() nounwind {
   %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
 
   %3 = load volatile <2 x i64>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <2 x i64> %3, <2 x i64>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: st.d [[R1]], 0([[BASE]])
 
   ret void
@@ -381,12 +381,12 @@ define void @loadstore_v2i64_just_over_simm16() nounwind {
   %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
 
   %3 = load volatile <2 x i64>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <2 x i64> %3, <2 x i64>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: st.d [[R1]], 0([[BASE]])
 
   ret void
diff --git a/test/CodeGen/Mips/no-odd-spreg.ll b/test/CodeGen/Mips/no-odd-spreg.ll
index b42ed6a..572e940 100644
--- a/test/CodeGen/Mips/no-odd-spreg.ll
+++ b/test/CodeGen/Mips/no-odd-spreg.ll
@@ -1,10 +1,14 @@
-; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ODDSPREG
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ODDSPREG -check-prefix=ODDSPREG-NO-EMIT
 ; RUN: llc -march=mipsel -mcpu=mips32 -mattr=+nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOODDSPREG
-; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ODDSPREG
+; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ODDSPREG -check-prefix=ODDSPREG-NO-EMIT
 ; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fp64,+nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOODDSPREG
+; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fpxx,-nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=ODDSPREG -check-prefix=ODDSPREG-EMIT
 
-; ODDSPREG:       .module oddspreg
-; NOODDSPREG:     .module nooddspreg
+; We don't emit a directive unless we need to. This is to support versions of
+; GAS which do not support the directive.
+; ODDSPREG-EMIT:        .module oddspreg
+; ODDSPREG-NO-EMIT-NOT: .module oddspreg
+; NOODDSPREG:           .module nooddspreg
 
 define float @two_floats(float %a) {
 entry:
diff --git a/test/CodeGen/Mips/nomips16.ll b/test/CodeGen/Mips/nomips16.ll
index 0affb16..5f7d74e 100644
--- a/test/CodeGen/Mips/nomips16.ll
+++ b/test/CodeGen/Mips/nomips16.ll
@@ -33,6 +33,6 @@ entry:
 ; CHECK: 	.end	nofoo
 
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
diff --git a/test/CodeGen/Mips/o32_cc.ll b/test/CodeGen/Mips/o32_cc.ll
index 08e5aab..c28f9ab 100644
--- a/test/CodeGen/Mips/o32_cc.ll
+++ b/test/CodeGen/Mips/o32_cc.ll
@@ -1,12 +1,13 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
-; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck %s
-; RUN: llc -march=mipsel < %s | FileCheck -check-prefix=FP32EL %s
-; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck -check-prefix=FP64EL %s
+; RUN: llc -march=mipsel < %s | FileCheck -check-prefix=ALL %s
+; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck -check-prefix=ALL %s
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck -check-prefix=ALL -check-prefix=NO-MFHC1 %s
+; RUN: llc -march=mipsel -mcpu=mips32r2              < %s | FileCheck -check-prefix=ALL -check-prefix=HAS-MFHC1 %s
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck -check-prefix=ALL -check-prefix=HAS-MFHC1 %s
 
 ; $f12, $f14
-; CHECK-LABEL: testlowercall0:
-; CHECK-DAG: ldc1 $f12, %lo
-; CHECK-DAG: ldc1 $f14, %lo
+; ALL-LABEL: testlowercall0:
+; ALL-DAG:       ldc1 $f12, %lo
+; ALL-DAG:       ldc1 $f14, %lo
 define void @testlowercall0() nounwind {
 entry:
   tail call void @f0(double 5.000000e+00, double 6.000000e+00) nounwind
@@ -16,9 +17,9 @@ entry:
 declare void @f0(double, double)
 
 ; $f12, $f14
-; CHECK-LABEL: testlowercall1:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: lwc1 $f14, %lo
+; ALL-LABEL: testlowercall1:
+; ALL-DAG:       lwc1 $f12, %lo
+; ALL-DAG:       lwc1 $f14, %lo
 define void @testlowercall1() nounwind {
 entry:
   tail call void @f1(float 8.000000e+00, float 9.000000e+00) nounwind
@@ -28,9 +29,9 @@ entry:
 declare void @f1(float, float)
 
 ; $f12, $f14
-; CHECK-LABEL: testlowercall2:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: ldc1 $f14, %lo
+; ALL-LABEL: testlowercall2:
+; ALL-DAG:       lwc1 $f12, %lo
+; ALL-DAG:       ldc1 $f14, %lo
 define void @testlowercall2() nounwind {
 entry:
   tail call void @f2(float 8.000000e+00, double 6.000000e+00) nounwind
@@ -40,9 +41,9 @@ entry:
 declare void @f2(float, double)
 
 ; $f12, $f14
-; CHECK-LABEL: testlowercall3:
-; CHECK-DAG: ldc1 $f12, %lo
-; CHECK-DAG: lwc1 $f14, %lo
+; ALL-LABEL: testlowercall3:
+; ALL-DAG:       ldc1 $f12, %lo
+; ALL-DAG:       lwc1 $f14, %lo
 define void @testlowercall3() nounwind {
 entry:
   tail call void @f3(double 5.000000e+00, float 9.000000e+00) nounwind
@@ -52,11 +53,11 @@ entry:
 declare void @f3(double, float)
 
 ; $4, $5, $6, $7
-; CHECK-LABEL: testlowercall4:
-; CHECK-DAG: addiu $4, $zero, 12
-; CHECK-DAG: addiu $5, $zero, 13
-; CHECK-DAG: addiu $6, $zero, 14
-; CHECK-DAG: addiu $7, $zero, 15
+; ALL-LABEL: testlowercall4:
+; ALL-DAG:       addiu $4, $zero, 12
+; ALL-DAG:       addiu $5, $zero, 13
+; ALL-DAG:       addiu $6, $zero, 14
+; ALL-DAG:       addiu $7, $zero, 15
 define void @testlowercall4() nounwind {
 entry:
   tail call void @f4(i32 12, i32 13, i32 14, i32 15) nounwind
@@ -66,11 +67,11 @@ entry:
 declare void @f4(i32, i32, i32, i32)
 
 ; $f12, $6, stack
-; CHECK-LABEL: testlowercall5:
-; CHECK-DAG: ldc1 $f12, %lo
-; CHECK-DAG: addiu $6, $zero, 23
-; CHECK-DAG: sw ${{[a-z0-9]+}}, 16($sp)
-; CHECK-DAG: sw ${{[a-z0-9]+}}, 20($sp)
+; ALL-LABEL: testlowercall5:
+; ALL-DAG:       ldc1 $f12, %lo
+; ALL-DAG:       addiu $6, $zero, 23
+; ALL-DAG:       sw ${{[a-z0-9]+}}, 16($sp)
+; ALL-DAG:       sw ${{[a-z0-9]+}}, 20($sp)
 define void @testlowercall5() nounwind {
 entry:
   tail call void @f5(double 1.500000e+01, i32 23, double 1.700000e+01) nounwind
@@ -80,10 +81,10 @@ entry:
 declare void @f5(double, i32, double)
 
 ; $f12, $6, $7
-; CHECK-LABEL: testlowercall6:
-; CHECK-DAG: ldc1 $f12, %lo
-; CHECK-DAG: addiu $6, $zero, 33
-; CHECK-DAG: addiu $7, $zero, 24
+; ALL-LABEL: testlowercall6:
+; ALL-DAG:       ldc1 $f12, %lo
+; ALL-DAG:       addiu $6, $zero, 33
+; ALL-DAG:       addiu $7, $zero, 24
 define void @testlowercall6() nounwind {
 entry:
   tail call void @f6(double 2.500000e+01, i32 33, i32 24) nounwind
@@ -93,10 +94,10 @@ entry:
 declare void @f6(double, i32, i32)
 
 ; $f12, $5, $6
-; CHECK-LABEL: testlowercall7:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: addiu $5, $zero, 43
-; CHECK-DAG: addiu $6, $zero, 34
+; ALL-LABEL: testlowercall7:
+; ALL-DAG:       lwc1 $f12, %lo
+; ALL-DAG:       addiu $5, $zero, 43
+; ALL-DAG:       addiu $6, $zero, 34
 define void @testlowercall7() nounwind {
 entry:
   tail call void @f7(float 1.800000e+01, i32 43, i32 34) nounwind
@@ -106,12 +107,12 @@ entry:
 declare void @f7(float, i32, i32)
 
 ; $4, $5, $6, stack
-; CHECK-LABEL: testlowercall8:
-; CHECK-DAG: addiu $4, $zero, 22
-; CHECK-DAG: addiu $5, $zero, 53
-; CHECK-DAG: addiu $6, $zero, 44
-; CHECK-DAG: sw ${{[a-z0-9]+}}, 16($sp)
-; CHECK-DAG: sw ${{[a-z0-9]+}}, 20($sp)
+; ALL-LABEL: testlowercall8:
+; ALL-DAG:       addiu $4, $zero, 22
+; ALL-DAG:       addiu $5, $zero, 53
+; ALL-DAG:       addiu $6, $zero, 44
+; ALL-DAG:       sw ${{[a-z0-9]+}}, 16($sp)
+; ALL-DAG:       sw ${{[a-z0-9]+}}, 20($sp)
 define void @testlowercall8() nounwind {
 entry:
   tail call void @f8(i32 22, i32 53, i32 44, double 4.000000e+00) nounwind
@@ -121,11 +122,11 @@ entry:
 declare void @f8(i32, i32, i32, double)
 
 ; $4, $5, $6, $7
-; CHECK-LABEL: testlowercall9:
-; CHECK-DAG: addiu $4, $zero, 32
-; CHECK-DAG: addiu $5, $zero, 63
-; CHECK-DAG: addiu $6, $zero, 54
-; CHECK-DAG: lui $7, 16688
+; ALL-LABEL: testlowercall9:
+; ALL-DAG:       addiu $4, $zero, 32
+; ALL-DAG:       addiu $5, $zero, 63
+; ALL-DAG:       addiu $6, $zero, 54
+; ALL-DAG:       lui $7, 16688
 define void @testlowercall9() nounwind {
 entry:
   tail call void @f9(i32 32, i32 63, i32 54, float 1.100000e+01) nounwind
@@ -135,15 +136,16 @@ entry:
 declare void @f9(i32, i32, i32, float)
 
 ; $4, $5, ($6, $7)
-; CHECK-LABEL: testlowercall10:
-; CHECK-DAG: addiu $4, $zero, 42
-; CHECK-DAG: addiu $5, $zero, 73
-; FP32EL-LABEL: testlowercall10:
-; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
-; FP64EL-LABEL: testlowercall10:
-; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
+; ALL-LABEL: testlowercall10:
+
+; ALL-DAG:       addiu $4, $zero, 42
+; ALL-DAG:       addiu $5, $zero, 73
+
+; NO-MFHC1-DAG:  mfc1 $6, $f{{[0-9]+}}
+; NO-MFHC1-DAG:  mfc1 $7, $f{{[0-9]+}}
+
+; HAS-MFHC1-DAG: mfc1 $6, $f{{[0-9]+}}
+; HAS-MFHC1-DAG: mfhc1 $7, $f{{[0-9]+}}
 define void @testlowercall10() nounwind {
 entry:
   tail call void @f10(i32 42, i32 73, double 2.700000e+01) nounwind
@@ -153,14 +155,14 @@ entry:
 declare void @f10(i32, i32, double)
 
 ; $4, ($6, $7)
-; CHECK-LABEL: testlowercall11:
-; CHECK-DAG: addiu $4, $zero, 52
-; FP32EL-LABEL: testlowercall11:
-; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
-; FP64EL-LABEL: testlowercall11:
-; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
+; ALL-LABEL: testlowercall11:
+; ALL-DAG:       addiu $4, $zero, 52
+
+; NO-MFHC1-DAG:  mfc1 $6, $f{{[0-9]+}}
+; NO-MFHC1-DAG:  mfc1 $7, $f{{[0-9]+}}
+
+; HAS-MFHC1-DAG: mfc1 $6, $f{{[0-9]+}}
+; HAS-MFHC1-DAG: mfhc1 $7, $f{{[0-9]+}}
 define void @testlowercall11() nounwind {
 entry:
   tail call void @f11(i32 52, double 1.600000e+01) nounwind
@@ -170,11 +172,11 @@ entry:
 declare void @f11(i32, double)
 
 ; $f12, $f14, $6, $7
-; CHECK-LABEL: testlowercall12:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: lwc1 $f14, %lo
-; CHECK-DAG: lui $6, 16672
-; CHECK-DAG: lui $7, 16808
+; ALL-LABEL: testlowercall12:
+; ALL-DAG:       lwc1 $f12, %lo
+; ALL-DAG:       lwc1 $f14, %lo
+; ALL-DAG:       lui $6, 16672
+; ALL-DAG:       lui $7, 16808
 define void @testlowercall12() nounwind {
 entry:
   tail call void @f12(float 2.800000e+01, float 1.900000e+01, float 1.000000e+01, float 2.100000e+01) nounwind
@@ -184,11 +186,11 @@ entry:
 declare void @f12(float, float, float, float)
 
 ; $f12, $5, $6, $7
-; CHECK-LABEL: testlowercall13:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: addiu $5, $zero, 83
-; CHECK-DAG: lui $6, 16800
-; CHECK-DAG: addiu $7, $zero, 25
+; ALL-LABEL: testlowercall13:
+; ALL-DAG:       lwc1 $f12, %lo
+; ALL-DAG:       addiu $5, $zero, 83
+; ALL-DAG:       lui $6, 16800
+; ALL-DAG:       addiu $7, $zero, 25
 define void @testlowercall13() nounwind {
 entry:
   tail call void @f13(float 3.800000e+01, i32 83, float 2.000000e+01, i32 25) nounwind
@@ -199,10 +201,10 @@ entry:
 declare void @f13(float, i32, float, i32)
 
 ; $f12, $f14, $7
-; CHECK-LABEL: testlowercall14:
-; CHECK-DAG: ldc1 $f12, %lo
-; CHECK-DAG: lwc1 $f14, %lo
-; CHECK-DAG: lui $7, 16880
+; ALL-LABEL: testlowercall14:
+; ALL-DAG:       ldc1 $f12, %lo
+; ALL-DAG:       lwc1 $f14, %lo
+; ALL-DAG:       lui $7, 16880
 define void @testlowercall14() nounwind {
 entry:
   tail call void @f14(double 3.500000e+01, float 2.900000e+01, float 3.000000e+01) nounwind
@@ -212,15 +214,15 @@ entry:
 declare void @f14(double, float, float)
 
 ; $f12, $f14, ($6, $7)
-; CHECK-LABEL: testlowercall15:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: lwc1 $f14, %lo
-; FP32EL-LABEL: testlowercall15:
-; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
-; FP64EL-LABEL: testlowercall15:
-; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
+; ALL-LABEL: testlowercall15:
+; ALL-DAG:       lwc1 $f12, %lo
+; ALL-DAG:       lwc1 $f14, %lo
+
+; NO-MFHC1-DAG:  mfc1 $6, $f{{[0-9]+}}
+; NO-MFHC1-DAG:  mfc1 $7, $f{{[0-9]+}}
+
+; HAS-MFHC1-DAG: mfc1 $6, $f{{[0-9]+}}
+; HAS-MFHC1-DAG: mfhc1 $7, $f{{[0-9]+}}
 define void @testlowercall15() nounwind {
 entry:
   tail call void @f15(float 4.800000e+01, float 3.900000e+01, double 3.700000e+01) nounwind
@@ -230,11 +232,11 @@ entry:
 declare void @f15(float, float, double)
 
 ; $4, $5, $6, $7
-; CHECK-LABEL: testlowercall16:
-; CHECK-DAG: addiu $4, $zero, 62
-; CHECK-DAG: lui $5, 16964
-; CHECK-DAG: addiu $6, $zero, 64
-; CHECK-DAG: lui $7, 16888
+; ALL-LABEL: testlowercall16:
+; ALL-DAG:       addiu $4, $zero, 62
+; ALL-DAG:       lui $5, 16964
+; ALL-DAG:       addiu $6, $zero, 64
+; ALL-DAG:       lui $7, 16888
 define void @testlowercall16() nounwind {
 entry:
   tail call void @f16(i32 62, float 4.900000e+01, i32 64, float 3.100000e+01) nounwind
@@ -244,11 +246,11 @@ entry:
 declare void @f16(i32, float, i32, float)
 
 ; $4, $5, $6, $7
-; CHECK-LABEL: testlowercall17:
-; CHECK-DAG: addiu $4, $zero, 72
-; CHECK-DAG: lui $5, 17004
-; CHECK-DAG: addiu $6, $zero, 74
-; CHECK-DAG: addiu $7, $zero, 35
+; ALL-LABEL: testlowercall17:
+; ALL-DAG:       addiu $4, $zero, 72
+; ALL-DAG:       lui $5, 17004
+; ALL-DAG:       addiu $6, $zero, 74
+; ALL-DAG:       addiu $7, $zero, 35
 define void @testlowercall17() nounwind {
 entry:
   tail call void @f17(i32 72, float 5.900000e+01, i32 74, i32 35) nounwind
@@ -258,11 +260,11 @@ entry:
 declare void @f17(i32, float, i32, i32)
 
 ; $4, $5, $6, $7
-; CHECK-LABEL: testlowercall18:
-; CHECK-DAG: addiu $4, $zero, 82
-; CHECK-DAG: addiu $5, $zero, 93
-; CHECK-DAG: lui $6, 16928
-; CHECK-DAG: addiu $7, $zero, 45
+; ALL-LABEL: testlowercall18:
+; ALL-DAG:       addiu $4, $zero, 82
+; ALL-DAG:       addiu $5, $zero, 93
+; ALL-DAG:       lui $6, 16928
+; ALL-DAG:       addiu $7, $zero, 45
 define void @testlowercall18() nounwind {
 entry:
   tail call void @f18(i32 82, i32 93, float 4.000000e+01, i32 45) nounwind
@@ -273,16 +275,16 @@ declare void @f18(i32, i32, float, i32)
 
 
 ; $4, ($6, $7), stack
-; CHECK-LABEL: testlowercall20:
-; CHECK-DAG: addiu $4, $zero, 92
-; CHECK-DAG: sw ${{[a-z0-9]+}}, 16($sp)
-; CHECK-DAG: sw ${{[a-z0-9]+}}, 20($sp)
-; FP32EL-LABEL: testlowercall20:
-; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
-; FP64EL-LABEL: testlowercall20:
-; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
+; ALL-LABEL: testlowercall20:
+; ALL-DAG:       addiu $4, $zero, 92
+; ALL-DAG:       sw ${{[a-z0-9]+}}, 16($sp)
+; ALL-DAG:       sw ${{[a-z0-9]+}}, 20($sp)
+
+; NO-MFHC1-DAG:  mfc1 $6, $f{{[0-9]+}}
+; NO-MFHC1-DAG:  mfc1 $7, $f{{[0-9]+}}
+
+; HAS-MFHC1-DAG: mfc1 $6, $f{{[0-9]+}}
+; HAS-MFHC1-DAG: mfhc1 $7, $f{{[0-9]+}}
 define void @testlowercall20() nounwind {
 entry:
   tail call void @f20(i32 92, double 2.600000e+01, double 4.700000e+01) nounwind
@@ -292,9 +294,9 @@ entry:
 declare void @f20(i32, double, double)
 
 ; $f12, $5
-; CHECK-LABEL: testlowercall21:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: addiu $5, $zero, 103
+; ALL-LABEL: testlowercall21:
+; ALL-DAG:       lwc1 $f12, %lo
+; ALL-DAG:       addiu $5, $zero, 103
 define void @testlowercall21() nounwind {
 entry:
   tail call void @f21(float 5.800000e+01, i32 103) nounwind
@@ -304,15 +306,15 @@ entry:
 declare void @f21(float, i32)
 
 ; $f12, $5, ($6, $7)
-; CHECK-LABEL: testlowercall22:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: addiu $5, $zero, 113
-; FP32EL-LABEL: testlowercall22:
-; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
-; FP64EL-LABEL: testlowercall22:
-; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
+; ALL-LABEL: testlowercall22:
+; ALL-DAG:       lwc1 $f12, %lo
+; ALL-DAG:       addiu $5, $zero, 113
+
+; NO-MFHC1-DAG:  mfc1 $6, $f{{[0-9]+}}
+; NO-MFHC1-DAG:  mfc1 $7, $f{{[0-9]+}}
+
+; HAS-MFHC1-DAG: mfc1 $6, $f{{[0-9]+}}
+; HAS-MFHC1-DAG: mfhc1 $7, $f{{[0-9]+}}
 define void @testlowercall22() nounwind {
 entry:
   tail call void @f22(float 6.800000e+01, i32 113, double 5.700000e+01) nounwind
@@ -322,9 +324,9 @@ entry:
 declare void @f22(float, i32, double)
 
 ; $f12, f6
-; CHECK-LABEL: testlowercall23:
-; CHECK-DAG: ldc1 $f12, %lo
-; CHECK-DAG: addiu $6, $zero, 123
+; ALL-LABEL: testlowercall23:
+; ALL-DAG:       ldc1 $f12, %lo
+; ALL-DAG:       addiu $6, $zero, 123
 define void @testlowercall23() nounwind {
 entry:
   tail call void @f23(double 4.500000e+01, i32 123) nounwind
@@ -334,11 +336,11 @@ entry:
 declare void @f23(double, i32)
 
 ; $f12,$6, stack
-; CHECK-LABEL: testlowercall24:
-; CHECK-DAG: ldc1 $f12, %lo
-; CHECK-DAG: addiu $6, $zero, 133
-; CHECK-DAG: sw ${{[a-z0-9]+}}, 16($sp)
-; CHECK-DAG: sw ${{[a-z0-9]+}}, 20($sp)
+; ALL-LABEL: testlowercall24:
+; ALL-DAG:       ldc1 $f12, %lo
+; ALL-DAG:       addiu $6, $zero, 133
+; ALL-DAG:       sw ${{[a-z0-9]+}}, 16($sp)
+; ALL-DAG:       sw ${{[a-z0-9]+}}, 20($sp)
 define void @testlowercall24() nounwind {
 entry:
   tail call void @f24(double 5.500000e+01, i32 133, double 6.700000e+01) nounwind
@@ -347,19 +349,19 @@ entry:
 
 declare void @f24(double, i32, double)
 
-; CHECK-LABEL: testlowercall25:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: lwc1 $f14, %lo
-; CHECK-DAG: lui $6
-; CHECK-DAG: lui $7
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: addiu $5, $zero, 83
-; CHECK-DAG: lui $6
-; CHECK-DAG: addiu $7, $zero, 25
-; CHECK-DAG: addiu $4, $zero, 82
-; CHECK-DAG: addiu $5, $zero, 93
-; CHECK-DAG: lui $6
-; CHECK-DAG: addiu $7, $zero, 45
+; ALL-LABEL: testlowercall25:
+; ALL-DAG:      lwc1 $f12, %lo
+; ALL-DAG:      lwc1 $f14, %lo
+; ALL-DAG:      lui $6
+; ALL-DAG:      lui $7
+; ALL-DAG:      lwc1 $f12, %lo
+; ALL-DAG:      addiu $5, $zero, 83
+; ALL-DAG:      lui $6
+; ALL-DAG:      addiu $7, $zero, 25
+; ALL-DAG:      addiu $4, $zero, 82
+; ALL-DAG:      addiu $5, $zero, 93
+; ALL-DAG:      lui $6
+; ALL-DAG:      addiu $7, $zero, 45
 define void @testlowercall25() nounwind {
 entry:
   tail call void @f12(float 2.800000e+01, float 1.900000e+01, float 1.000000e+01, float 2.100000e+01) nounwind
diff --git a/test/CodeGen/Mips/octeon_popcnt.ll b/test/CodeGen/Mips/octeon_popcnt.ll
index 52c37f6..3432b39 100644
--- a/test/CodeGen/Mips/octeon_popcnt.ll
+++ b/test/CodeGen/Mips/octeon_popcnt.ll
@@ -6,7 +6,7 @@ define i8 @cnt8(i8 %x) nounwind readnone {
   ret i8 %cnt
 ; OCTEON-LABEL: cnt8:
 ; OCTEON: jr   $ra
-; OCTEON: pop  $2, $1
+; OCTEON: pop  $2, [[R1:\$[0-9]+]]
 ; MIPS64-LABEL: cnt8:
 ; MIPS64-NOT: pop
 }
@@ -16,12 +16,12 @@ define i16 @cnt16(i16 %x) nounwind readnone {
   ret i16 %cnt
 ; OCTEON-LABEL: cnt16:
 ; OCTEON: jr   $ra
-; OCTEON: pop  $2, $1
+; OCTEON: pop  $2, [[R1:\$[0-9]+]]
 ; MIPS64-LABEL: cnt16:
 ; MIPS64-NOT: pop
 }
 
-define i32 @cnt32(i32 %x) nounwind readnone {
+define i32 @cnt32(i32 zeroext %x) nounwind readnone {
   %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
   ret i32 %cnt
 ; OCTEON-LABEL: cnt32:
diff --git a/test/CodeGen/Mips/prevent-hoisting.ll b/test/CodeGen/Mips/prevent-hoisting.ll
index da665c2..210fe3b 100644
--- a/test/CodeGen/Mips/prevent-hoisting.ll
+++ b/test/CodeGen/Mips/prevent-hoisting.ll
@@ -10,16 +10,19 @@
 
 ; CHECK-LABEL: readLumaCoeff8x8_CABAC
 
-; The check for "addiu" instruction is added so that we can match the correct "b" instruction.
+; The check for first "addiu" instruction is added so that we can match the correct "b" instruction.
 ; CHECK:           addiu ${{[0-9]+}}, $zero, -1
 ; CHECK:           b $[[BB0:BB[0-9_]+]]
+; CHECK-NEXT:      addiu ${{[0-9]+}}, $zero, 0
 
-; Check that sll instruction that writes to $1 starts basic block.
-; CHECK:       {{BB[0-9_#]+}}: 
+; Check that at the start of a fallthrough block there is a instruction that writes to $1.
+; CHECK-NEXT:  {{BB[0-9_#]+}}: 
+; CHECK-NEXT:      lw      $[[R1:[0-9]+]], %got(assignSE2partition)($[[R2:[0-9]+]])
 ; CHECK-NEXT:      sll $1, $[[R0:[0-9]+]], 4
 
-; Check that identical sll instruction starts another basic block.
+; Check that identical instructions are at the start of a target block.
 ; CHECK:       [[BB0]]:
+; CHECK-NEXT:      lw      $[[R1]], %got(assignSE2partition)($[[R2]])
 ; CHECK-NEXT:      sll $1, $[[R0]], 4
 
 
diff --git a/test/CodeGen/Mips/select.ll b/test/CodeGen/Mips/select.ll
index eb2198b..d6e1826 100644
--- a/test/CodeGen/Mips/select.ll
+++ b/test/CodeGen/Mips/select.ll
@@ -8,7 +8,7 @@
 @d2 = external global double
 @d3 = external global double
 
-define i32 @i32_icmp_ne_i32_val(i32 %s, i32 %f0, i32 %f1) nounwind readnone {
+define i32 @i32_icmp_ne_i32_val(i32 signext %s, i32 signext %f0, i32 signext %f1) nounwind readnone {
 entry:
 ; ALL-LABEL: i32_icmp_ne_i32_val:
 
@@ -37,7 +37,7 @@ entry:
   ret i32 %cond
 }
 
-define i64 @i32_icmp_ne_i64_val(i32 %s, i64 %f0, i64 %f1) nounwind readnone {
+define i64 @i32_icmp_ne_i64_val(i32 signext %s, i64 %f0, i64 %f1) nounwind readnone {
 entry:
 ; ALL-LABEL: i32_icmp_ne_i64_val:
 
@@ -128,7 +128,7 @@ entry:
   ret i64 %cond
 }
 
-define float @i32_icmp_ne_f32_val(i32 %s, float %f0, float %f1) nounwind readnone {
+define float @i32_icmp_ne_f32_val(i32 signext %s, float %f0, float %f1) nounwind readnone {
 entry:
 ; ALL-LABEL: i32_icmp_ne_f32_val:
 
@@ -161,7 +161,7 @@ entry:
   ret float %cond
 }
 
-define double @i32_icmp_ne_f64_val(i32 %s, double %f0, double %f1) nounwind readnone {
+define double @i32_icmp_ne_f64_val(i32 signext %s, double %f0, double %f1) nounwind readnone {
 entry:
 ; ALL-LABEL: i32_icmp_ne_f64_val:
 
@@ -496,7 +496,7 @@ entry:
   ret float %cond
 }
 
-define i32 @f32_fcmp_oeq_i32_val(i32 %f0, i32 %f1, float %f2, float %f3) nounwind readnone {
+define i32 @f32_fcmp_oeq_i32_val(i32 signext %f0, i32 signext %f1, float %f2, float %f3) nounwind readnone {
 entry:
 ; ALL-LABEL: f32_fcmp_oeq_i32_val:
 
@@ -541,7 +541,7 @@ entry:
   ret i32 %cond
 }
 
-define i32 @f32_fcmp_olt_i32_val(i32 %f0, i32 %f1, float %f2, float %f3) nounwind readnone {
+define i32 @f32_fcmp_olt_i32_val(i32 signext %f0, i32 signext %f1, float %f2, float %f3) nounwind readnone {
 entry:
 ; ALL-LABEL: f32_fcmp_olt_i32_val:
 
@@ -585,7 +585,7 @@ entry:
   ret i32 %cond
 }
 
-define i32 @f32_fcmp_ogt_i32_val(i32 %f0, i32 %f1, float %f2, float %f3) nounwind readnone {
+define i32 @f32_fcmp_ogt_i32_val(i32 signext %f0, i32 signext %f1, float %f2, float %f3) nounwind readnone {
 entry:
 ; ALL-LABEL: f32_fcmp_ogt_i32_val:
 
@@ -630,7 +630,7 @@ entry:
   ret i32 %cond
 }
 
-define i32 @f64_fcmp_oeq_i32_val(i32 %f0, i32 %f1) nounwind readonly {
+define i32 @f64_fcmp_oeq_i32_val(i32 signext %f0, i32 signext %f1) nounwind readonly {
 entry:
 ; ALL-LABEL: f64_fcmp_oeq_i32_val:
 
@@ -707,7 +707,7 @@ entry:
   ret i32 %cond
 }
 
-define i32 @f64_fcmp_olt_i32_val(i32 %f0, i32 %f1) nounwind readonly {
+define i32 @f64_fcmp_olt_i32_val(i32 signext %f0, i32 signext %f1) nounwind readonly {
 entry:
 ; ALL-LABEL: f64_fcmp_olt_i32_val:
 
@@ -784,7 +784,7 @@ entry:
   ret i32 %cond
 }
 
-define i32 @f64_fcmp_ogt_i32_val(i32 %f0, i32 %f1) nounwind readonly {
+define i32 @f64_fcmp_ogt_i32_val(i32 signext %f0, i32 signext %f1) nounwind readonly {
 entry:
 ; ALL-LABEL: f64_fcmp_ogt_i32_val:
 
diff --git a/test/CodeGen/Mips/seleq.ll b/test/CodeGen/Mips/seleq.ll
index 190baad..9af422f 100644
--- a/test/CodeGen/Mips/seleq.ll
+++ b/test/CodeGen/Mips/seleq.ll
@@ -10,7 +10,7 @@
 @z3 = common global i32 0, align 4
 @z4 = common global i32 0, align 4
 
-define void @calc_seleq() nounwind "target-cpu"="mips32" "target-features"="+o32,+mips32" {
+define void @calc_seleq() nounwind {
 entry:
   %0 = load i32* @a, align 4
   %1 = load i32* @b, align 4
diff --git a/test/CodeGen/Mips/small-section-reserve-gp.ll b/test/CodeGen/Mips/small-section-reserve-gp.ll
index 03503fb..cbf0681 100644
--- a/test/CodeGen/Mips/small-section-reserve-gp.ll
+++ b/test/CodeGen/Mips/small-section-reserve-gp.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=mipsel-sde-elf -march=mipsel -relocation-model=static < %s \
+; RUN: llc -mtriple=mipsel-sde-elf -march=mipsel -relocation-model=static -mattr=+noabicalls -mgpopt < %s \
 ; RUN: | FileCheck %s
 
 @i = internal unnamed_addr global i32 0, align 4
diff --git a/test/CodeGen/Mips/start-asm-file.ll b/test/CodeGen/Mips/start-asm-file.ll
index 8872464..9dc501c 100644
--- a/test/CodeGen/Mips/start-asm-file.ll
+++ b/test/CodeGen/Mips/start-asm-file.ll
@@ -1,7 +1,4 @@
 ; Check the emission of directives at the start of an asm file.
-; This test is XFAILED until we fix the emission of '.option pic0' on
-; N32. At the moment we check if subtarget is Mips64 when we should be
-; checking the Subtarget's ABI.
 
 ; ### O32 ABI ###
 ; RUN: llc -filetype=asm -mtriple mips-unknown-linux -mcpu=mips32 \
diff --git a/test/CodeGen/Mips/zeroreg.ll b/test/CodeGen/Mips/zeroreg.ll
index a1b6cb0..c766d3b 100644
--- a/test/CodeGen/Mips/zeroreg.ll
+++ b/test/CodeGen/Mips/zeroreg.ll
@@ -8,7 +8,7 @@
 
 @g1 = external global i32
 
-define i32 @sel_icmp_nez_i32_z0(i32 %s) nounwind readonly {
+define i32 @sel_icmp_nez_i32_z0(i32 signext %s) nounwind readonly {
 entry:
 ; ALL-LABEL: sel_icmp_nez_i32_z0:
 
@@ -30,7 +30,7 @@ entry:
   ret i32 %cond
 }
 
-define i32 @sel_icmp_nez_i32_z1(i32 %s) nounwind readonly {
+define i32 @sel_icmp_nez_i32_z1(i32 signext %s) nounwind readonly {
 entry:
 ; ALL-LABEL: sel_icmp_nez_i32_z1:
 
diff --git a/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll b/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
index e474fa4..c167db4 100644
--- a/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
+++ b/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s
 
 ;; These tests should run for all targets
 
@@ -9,28 +9,28 @@
 ;;; f64
 
 define double @fadd_f64(double %a, double %b) {
-; CHECK: add.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: add.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
 ; CHECK: ret
   %ret = fadd double %a, %b
   ret double %ret
 }
 
 define double @fsub_f64(double %a, double %b) {
-; CHECK: sub.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: sub.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
 ; CHECK: ret
   %ret = fsub double %a, %b
   ret double %ret
 }
 
 define double @fmul_f64(double %a, double %b) {
-; CHECK: mul.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: mul.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
 ; CHECK: ret
   %ret = fmul double %a, %b
   ret double %ret
 }
 
 define double @fdiv_f64(double %a, double %b) {
-; CHECK: div.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: div.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
 ; CHECK: ret
   %ret = fdiv double %a, %b
   ret double %ret
diff --git a/test/CodeGen/NVPTX/arithmetic-int.ll b/test/CodeGen/NVPTX/arithmetic-int.ll
index 8d73b7e..b5a2872 100644
--- a/test/CodeGen/NVPTX/arithmetic-int.ll
+++ b/test/CodeGen/NVPTX/arithmetic-int.ll
@@ -9,70 +9,70 @@
 ;;; i64
 
 define i64 @add_i64(i64 %a, i64 %b) {
-; CHECK: add.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = add i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @sub_i64(i64 %a, i64 %b) {
-; CHECK: sub.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: sub.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = sub i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @mul_i64(i64 %a, i64 %b) {
-; CHECK: mul.lo.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: mul.lo.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = mul i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @sdiv_i64(i64 %a, i64 %b) {
-; CHECK: div.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: div.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = sdiv i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @udiv_i64(i64 %a, i64 %b) {
-; CHECK: div.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: div.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = udiv i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @srem_i64(i64 %a, i64 %b) {
-; CHECK: rem.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: rem.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = srem i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @urem_i64(i64 %a, i64 %b) {
-; CHECK: rem.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: rem.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = urem i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @and_i64(i64 %a, i64 %b) {
-; CHECK: and.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: and.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = and i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @or_i64(i64 %a, i64 %b) {
-; CHECK: or.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: or.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = or i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @xor_i64(i64 %a, i64 %b) {
-; CHECK: xor.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: xor.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = xor i64 %a, %b
   ret i64 %ret
@@ -80,7 +80,7 @@ define i64 @xor_i64(i64 %a, i64 %b) {
 
 define i64 @shl_i64(i64 %a, i64 %b) {
 ; PTX requires 32-bit shift amount
-; CHECK: shl.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: shl.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %r{{[0-9]+}}
 ; CHECK: ret
   %ret = shl i64 %a, %b
   ret i64 %ret
@@ -88,7 +88,7 @@ define i64 @shl_i64(i64 %a, i64 %b) {
 
 define i64 @ashr_i64(i64 %a, i64 %b) {
 ; PTX requires 32-bit shift amount
-; CHECK: shr.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: shr.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %r{{[0-9]+}}
 ; CHECK: ret
   %ret = ashr i64 %a, %b
   ret i64 %ret
@@ -96,7 +96,7 @@ define i64 @ashr_i64(i64 %a, i64 %b) {
 
 define i64 @lshr_i64(i64 %a, i64 %b) {
 ; PTX requires 32-bit shift amount
-; CHECK: shr.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: shr.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %r{{[0-9]+}}
 ; CHECK: ret
   %ret = lshr i64 %a, %b
   ret i64 %ret
diff --git a/test/CodeGen/NVPTX/atomics.ll b/test/CodeGen/NVPTX/atomics.ll
index 10ab73d..daadb6e 100644
--- a/test/CodeGen/NVPTX/atomics.ll
+++ b/test/CodeGen/NVPTX/atomics.ll
@@ -1,21 +1,21 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 
 
-; CHECK: atom0
+; CHECK-LABEL: atom0
 define i32 @atom0(i32* %addr, i32 %val) {
 ; CHECK: atom.add.u32
   %ret = atomicrmw add i32* %addr, i32 %val seq_cst
   ret i32 %ret
 }
 
-; CHECK: atom1
+; CHECK-LABEL: atom1
 define i64 @atom1(i64* %addr, i64 %val) {
 ; CHECK: atom.add.u64
   %ret = atomicrmw add i64* %addr, i64 %val seq_cst
   ret i64 %ret
 }
 
-; CHECK: atom2
+; CHECK-LABEL: atom2
 define i32 @atom2(i32* %subr, i32 %val) {
 ; CHECK: neg.s32
 ; CHECK: atom.add.u32
@@ -23,7 +23,7 @@ define i32 @atom2(i32* %subr, i32 %val) {
   ret i32 %ret
 }
 
-; CHECK: atom3
+; CHECK-LABEL: atom3
 define i64 @atom3(i64* %subr, i64 %val) {
 ; CHECK: neg.s64
 ; CHECK: atom.add.u64
@@ -31,14 +31,14 @@ define i64 @atom3(i64* %subr, i64 %val) {
   ret i64 %ret
 }
 
-; CHECK: atom4
+; CHECK-LABEL: atom4
 define i32 @atom4(i32* %subr, i32 %val) {
 ; CHECK: atom.and.b32
   %ret = atomicrmw and i32* %subr, i32 %val seq_cst
   ret i32 %ret
 }
 
-; CHECK: atom5
+; CHECK-LABEL: atom5
 define i64 @atom5(i64* %subr, i64 %val) {
 ; CHECK: atom.and.b64
   %ret = atomicrmw and i64* %subr, i64 %val seq_cst
@@ -56,86 +56,127 @@ define i64 @atom5(i64* %subr, i64 %val) {
 ;  ret i64 %ret
 ;}
 
-; CHECK: atom8
+; CHECK-LABEL: atom8
 define i32 @atom8(i32* %subr, i32 %val) {
 ; CHECK: atom.or.b32
   %ret = atomicrmw or i32* %subr, i32 %val seq_cst
   ret i32 %ret
 }
 
-; CHECK: atom9
+; CHECK-LABEL: atom9
 define i64 @atom9(i64* %subr, i64 %val) {
 ; CHECK: atom.or.b64
   %ret = atomicrmw or i64* %subr, i64 %val seq_cst
   ret i64 %ret
 }
 
-; CHECK: atom10
+; CHECK-LABEL: atom10
 define i32 @atom10(i32* %subr, i32 %val) {
 ; CHECK: atom.xor.b32
   %ret = atomicrmw xor i32* %subr, i32 %val seq_cst
   ret i32 %ret
 }
 
-; CHECK: atom11
+; CHECK-LABEL: atom11
 define i64 @atom11(i64* %subr, i64 %val) {
 ; CHECK: atom.xor.b64
   %ret = atomicrmw xor i64* %subr, i64 %val seq_cst
   ret i64 %ret
 }
 
-; CHECK: atom12
+; CHECK-LABEL: atom12
 define i32 @atom12(i32* %subr, i32 %val) {
 ; CHECK: atom.max.s32
   %ret = atomicrmw max i32* %subr, i32 %val seq_cst
   ret i32 %ret
 }
 
-; CHECK: atom13
+; CHECK-LABEL: atom13
 define i64 @atom13(i64* %subr, i64 %val) {
 ; CHECK: atom.max.s64
   %ret = atomicrmw max i64* %subr, i64 %val seq_cst
   ret i64 %ret
 }
 
-; CHECK: atom14
+; CHECK-LABEL: atom14
 define i32 @atom14(i32* %subr, i32 %val) {
 ; CHECK: atom.min.s32
   %ret = atomicrmw min i32* %subr, i32 %val seq_cst
   ret i32 %ret
 }
 
-; CHECK: atom15
+; CHECK-LABEL: atom15
 define i64 @atom15(i64* %subr, i64 %val) {
 ; CHECK: atom.min.s64
   %ret = atomicrmw min i64* %subr, i64 %val seq_cst
   ret i64 %ret
 }
 
-; CHECK: atom16
+; CHECK-LABEL: atom16
 define i32 @atom16(i32* %subr, i32 %val) {
 ; CHECK: atom.max.u32
   %ret = atomicrmw umax i32* %subr, i32 %val seq_cst
   ret i32 %ret
 }
 
-; CHECK: atom17
+; CHECK-LABEL: atom17
 define i64 @atom17(i64* %subr, i64 %val) {
 ; CHECK: atom.max.u64
   %ret = atomicrmw umax i64* %subr, i64 %val seq_cst
   ret i64 %ret
 }
 
-; CHECK: atom18
+; CHECK-LABEL: atom18
 define i32 @atom18(i32* %subr, i32 %val) {
 ; CHECK: atom.min.u32
   %ret = atomicrmw umin i32* %subr, i32 %val seq_cst
   ret i32 %ret
 }
 
-; CHECK: atom19
+; CHECK-LABEL: atom19
 define i64 @atom19(i64* %subr, i64 %val) {
 ; CHECK: atom.min.u64
   %ret = atomicrmw umin i64* %subr, i64 %val seq_cst
   ret i64 %ret
 }
+
+declare float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %addr, float %val)
+
+; CHECK-LABEL: atomic_add_f32_generic
+define float @atomic_add_f32_generic(float* %addr, float %val) {
+; CHECK: atom.add.f32
+  %ret = call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %addr, float %val)
+  ret float %ret
+}
+
+declare float @llvm.nvvm.atomic.load.add.f32.p1f32(float addrspace(1)* %addr, float %val)
+
+; CHECK-LABEL: atomic_add_f32_addrspace1
+define float @atomic_add_f32_addrspace1(float addrspace(1)* %addr, float %val) {
+; CHECK: atom.global.add.f32
+  %ret = call float @llvm.nvvm.atomic.load.add.f32.p1f32(float addrspace(1)* %addr, float %val)
+  ret float %ret
+}
+
+declare float @llvm.nvvm.atomic.load.add.f32.p3f32(float addrspace(3)* %addr, float %val)
+
+; CHECK-LABEL: atomic_add_f32_addrspace3
+define float @atomic_add_f32_addrspace3(float addrspace(3)* %addr, float %val) {
+; CHECK: atom.shared.add.f32
+  %ret = call float @llvm.nvvm.atomic.load.add.f32.p3f32(float addrspace(3)* %addr, float %val)
+  ret float %ret
+}
+
+; CHECK-LABEL: atomic_cmpxchg_i32
+define i32 @atomic_cmpxchg_i32(i32* %addr, i32 %cmp, i32 %new) {
+; CHECK: atom.cas.b32
+  %pairold = cmpxchg i32* %addr, i32 %cmp, i32 %new seq_cst seq_cst
+  ret i32 %new
+}
+
+; CHECK-LABEL: atomic_cmpxchg_i64
+define i64 @atomic_cmpxchg_i64(i64* %addr, i64 %cmp, i64 %new) {
+; CHECK: atom.cas.b64
+  %pairold = cmpxchg i64* %addr, i64 %cmp, i64 %new seq_cst seq_cst
+  ret i64 %new
+}
diff --git a/test/CodeGen/NVPTX/bug21465.ll b/test/CodeGen/NVPTX/bug21465.ll
new file mode 100644
index 0000000..157b28c
--- /dev/null
+++ b/test/CodeGen/NVPTX/bug21465.ll
@@ -0,0 +1,24 @@
+; RUN: opt < %s -nvptx-lower-struct-args -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+%struct.S = type { i32, i32 }
+
+; Function Attrs: nounwind
+define void @_Z11TakesStruct1SPi(%struct.S* byval nocapture readonly %input, i32* nocapture %output) #0 {
+entry:
+; CHECK-LABEL @_Z22TakesStruct1SPi
+; CHECK:   bitcast %struct.S* %input to i8*
+; CHECK:   call i8 addrspace(101)* @llvm.nvvm.ptr.gen.to.param.p101i8.p0i8
+  %b = getelementptr inbounds %struct.S* %input, i64 0, i32 1
+  %0 = load i32* %b, align 4
+  store i32 %0, i32* %output, align 4
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!nvvm.annotations = !{!0}
+
+!0 = metadata !{void (%struct.S*, i32*)* @_Z11TakesStruct1SPi, metadata !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 28dfa46..83d4916 100644
--- a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -20,11 +20,11 @@ entry:
   %buf = alloca [16 x i8], align 4
 
 ; CHECK: .local .align 4 .b8 	__local_depot0[16]
-; CHECK: mov.u64 %rl[[BUF_REG:[0-9]+]]
-; CHECK: cvta.local.u64 %SP, %rl[[BUF_REG]]
+; CHECK: mov.u64 %rd[[BUF_REG:[0-9]+]]
+; CHECK: cvta.local.u64 %SP, %rd[[BUF_REG]]
 
-; CHECK: ld.param.u64 %rl[[A_REG:[0-9]+]], [kernel_func_param_0]
-; CHECK: ld.f32 %f[[A0_REG:[0-9]+]], [%rl[[A_REG]]]
+; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
+; CHECK: ld.f32 %f[[A0_REG:[0-9]+]], [%rd[[A_REG]]]
 ; CHECK: st.f32 [%SP+0], %f[[A0_REG]]
 
   %0 = load float* %a, align 4
@@ -46,11 +46,11 @@ entry:
   %7 = bitcast i8* %arrayidx7 to float*
   store float %6, float* %7, align 4
 
-; CHECK: add.u64 %rl[[SP_REG:[0-9]+]], %SP, 0
+; CHECK: add.u64 %rd[[SP_REG:[0-9]+]], %SP, 0
 ; CHECK:        .param .b64 param0;
-; CHECK-NEXT:   st.param.b64  [param0+0], %rl[[A_REG]]
+; CHECK-NEXT:   st.param.b64  [param0+0], %rd[[A_REG]]
 ; CHECK-NEXT:   .param .b64 param1;
-; CHECK-NEXT:   st.param.b64  [param1+0], %rl[[SP_REG]]
+; CHECK-NEXT:   st.param.b64  [param1+0], %rd[[SP_REG]]
 ; CHECK-NEXT:   call.uni
 ; CHECK-NEXT:   callee,
 
diff --git a/test/CodeGen/NVPTX/compare-int.ll b/test/CodeGen/NVPTX/compare-int.ll
index c595f21..e4e0601 100644
--- a/test/CodeGen/NVPTX/compare-int.ll
+++ b/test/CodeGen/NVPTX/compare-int.ll
@@ -9,8 +9,8 @@
 ;;; i64
 
 define i64 @icmp_eq_i64(i64 %a, i64 %b) {
-; CHECK: setp.eq.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.eq.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp eq i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -18,8 +18,8 @@ define i64 @icmp_eq_i64(i64 %a, i64 %b) {
 }
 
 define i64 @icmp_ne_i64(i64 %a, i64 %b) {
-; CHECK: setp.ne.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.ne.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp ne i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -27,8 +27,8 @@ define i64 @icmp_ne_i64(i64 %a, i64 %b) {
 }
 
 define i64 @icmp_ugt_i64(i64 %a, i64 %b) {
-; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp ugt i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -36,8 +36,8 @@ define i64 @icmp_ugt_i64(i64 %a, i64 %b) {
 }
 
 define i64 @icmp_uge_i64(i64 %a, i64 %b) {
-; CHECK: setp.ge.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.ge.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp uge i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -45,8 +45,8 @@ define i64 @icmp_uge_i64(i64 %a, i64 %b) {
 }
 
 define i64 @icmp_ult_i64(i64 %a, i64 %b) {
-; CHECK: setp.lt.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.lt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp ult i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -54,8 +54,8 @@ define i64 @icmp_ult_i64(i64 %a, i64 %b) {
 }
 
 define i64 @icmp_ule_i64(i64 %a, i64 %b) {
-; CHECK: setp.le.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.le.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp ule i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -63,8 +63,8 @@ define i64 @icmp_ule_i64(i64 %a, i64 %b) {
 }
 
 define i64 @icmp_sgt_i64(i64 %a, i64 %b) {
-; CHECK: setp.gt.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.gt.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp sgt i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -72,8 +72,8 @@ define i64 @icmp_sgt_i64(i64 %a, i64 %b) {
 }
 
 define i64 @icmp_sge_i64(i64 %a, i64 %b) {
-; CHECK: setp.ge.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.ge.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp sge i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -81,8 +81,8 @@ define i64 @icmp_sge_i64(i64 %a, i64 %b) {
 }
 
 define i64 @icmp_slt_i64(i64 %a, i64 %b) {
-; CHECK: setp.lt.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.lt.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp slt i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -90,8 +90,8 @@ define i64 @icmp_slt_i64(i64 %a, i64 %b) {
 }
 
 define i64 @icmp_sle_i64(i64 %a, i64 %b) {
-; CHECK: setp.le.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.le.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp sle i64 %a, %b
   %ret = zext i1 %cmp to i64
diff --git a/test/CodeGen/NVPTX/convert-fp.ll b/test/CodeGen/NVPTX/convert-fp.ll
index 1882121..4b5446e 100644
--- a/test/CodeGen/NVPTX/convert-fp.ll
+++ b/test/CodeGen/NVPTX/convert-fp.ll
@@ -10,7 +10,7 @@ define i16 @cvt_i16_f32(float %x) {
 }
 
 define i16 @cvt_i16_f64(double %x) {
-; CHECK: cvt.rzi.u16.f64 %rs{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: cvt.rzi.u16.f64 %rs{{[0-9]+}}, %fd{{[0-9]+}};
 ; CHECK: ret;
   %a = fptoui double %x to i16
   ret i16 %a
@@ -24,7 +24,7 @@ define i32 @cvt_i32_f32(float %x) {
 }
 
 define i32 @cvt_i32_f64(double %x) {
-; CHECK: cvt.rzi.u32.f64 %r{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: cvt.rzi.u32.f64 %r{{[0-9]+}}, %fd{{[0-9]+}};
 ; CHECK: ret;
   %a = fptoui double %x to i32
   ret i32 %a
@@ -32,14 +32,14 @@ define i32 @cvt_i32_f64(double %x) {
 
 
 define i64 @cvt_i64_f32(float %x) {
-; CHECK: cvt.rzi.u64.f32 %rl{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: cvt.rzi.u64.f32 %rd{{[0-9]+}}, %f{{[0-9]+}};
 ; CHECK: ret;
   %a = fptoui float %x to i64
   ret i64 %a
 }
 
 define i64 @cvt_i64_f64(double %x) {
-; CHECK: cvt.rzi.u64.f64 %rl{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: cvt.rzi.u64.f64 %rd{{[0-9]+}}, %fd{{[0-9]+}};
 ; CHECK: ret;
   %a = fptoui double %x to i64
   ret i64 %a
@@ -60,14 +60,14 @@ define float @cvt_f32_i32(i32 %x) {
 }
 
 define float @cvt_f32_i64(i64 %x) {
-; CHECK: cvt.rn.f32.u64 %f{{[0-9]+}}, %rl{{[0-9]+}};
+; CHECK: cvt.rn.f32.u64 %f{{[0-9]+}}, %rd{{[0-9]+}};
 ; CHECK: ret;
   %a = uitofp i64 %x to float
   ret float %a
 }
 
 define float @cvt_f32_f64(double %x) {
-; CHECK: cvt.rn.f32.f64 %f{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: cvt.rn.f32.f64 %f{{[0-9]+}}, %fd{{[0-9]+}};
 ; CHECK: ret;
   %a = fptrunc double %x to float
   ret float %a
@@ -88,56 +88,56 @@ define float @cvt_f32_s32(i32 %x) {
 }
 
 define float @cvt_f32_s64(i64 %x) {
-; CHECK: cvt.rn.f32.s64 %f{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: cvt.rn.f32.s64 %f{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %a = sitofp i64 %x to float
   ret float %a
 }
 
 define double @cvt_f64_i16(i16 %x) {
-; CHECK: cvt.rn.f64.u16 %fl{{[0-9]+}}, %rs{{[0-9]+}};
+; CHECK: cvt.rn.f64.u16 %fd{{[0-9]+}}, %rs{{[0-9]+}};
 ; CHECK: ret;
   %a = uitofp i16 %x to double
   ret double %a
 }
 
 define double @cvt_f64_i32(i32 %x) {
-; CHECK: cvt.rn.f64.u32 %fl{{[0-9]+}}, %r{{[0-9]+}};
+; CHECK: cvt.rn.f64.u32 %fd{{[0-9]+}}, %r{{[0-9]+}};
 ; CHECK: ret;
   %a = uitofp i32 %x to double
   ret double %a
 }
 
 define double @cvt_f64_i64(i64 %x) {
-; CHECK: cvt.rn.f64.u64 %fl{{[0-9]+}}, %rl{{[0-9]+}};
+; CHECK: cvt.rn.f64.u64 %fd{{[0-9]+}}, %rd{{[0-9]+}};
 ; CHECK: ret;
   %a = uitofp i64 %x to double
   ret double %a
 }
 
 define double @cvt_f64_f32(float %x) {
-; CHECK: cvt.f64.f32 %fl{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: cvt.f64.f32 %fd{{[0-9]+}}, %f{{[0-9]+}};
 ; CHECK: ret;
   %a = fpext float %x to double
   ret double %a
 }
 
 define double @cvt_f64_s16(i16 %x) {
-; CHECK: cvt.rn.f64.s16 %fl{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: cvt.rn.f64.s16 %fd{{[0-9]+}}, %rs{{[0-9]+}}
 ; CHECK: ret
   %a = sitofp i16 %x to double
   ret double %a
 }
 
 define double @cvt_f64_s32(i32 %x) {
-; CHECK: cvt.rn.f64.s32 %fl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: cvt.rn.f64.s32 %fd{{[0-9]+}}, %r{{[0-9]+}}
 ; CHECK: ret
   %a = sitofp i32 %x to double
   ret double %a
 }
 
 define double @cvt_f64_s64(i64 %x) {
-; CHECK: cvt.rn.f64.s64 %fl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: cvt.rn.f64.s64 %fd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %a = sitofp i64 %x to double
   ret double %a
diff --git a/test/CodeGen/NVPTX/convert-int-sm20.ll b/test/CodeGen/NVPTX/convert-int-sm20.ll
index 227cd31..57a2316 100644
--- a/test/CodeGen/NVPTX/convert-int-sm20.ll
+++ b/test/CodeGen/NVPTX/convert-int-sm20.ll
@@ -48,16 +48,16 @@ define i32 @cvt_i32_i64(i64 %x) {
 ; i64
 
 define i64 @cvt_i64_i16(i16 %x) {
-; CHECK: ld.param.u16 %rl[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}]
-; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rl[[R0]]
+; CHECK: ld.param.u16 %rd[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}]
+; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rd[[R0]]
 ; CHECK: ret
   %a = zext i16 %x to i64
   ret i64 %a
 }
 
 define i64 @cvt_i64_i32(i32 %x) {
-; CHECK: ld.param.u32 %rl[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}]
-; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rl[[R0]]
+; CHECK: ld.param.u32 %rd[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}]
+; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rd[[R0]]
 ; CHECK: ret
   %a = zext i32 %x to i64
   ret i64 %a
diff --git a/test/CodeGen/NVPTX/fma.ll b/test/CodeGen/NVPTX/fma.ll
index 4ef1a9a..14b5c45 100644
--- a/test/CodeGen/NVPTX/fma.ll
+++ b/test/CodeGen/NVPTX/fma.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
 
 define ptx_device float @t1_f32(float %x, float %y, float %z) {
 ; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
@@ -9,7 +9,7 @@ define ptx_device float @t1_f32(float %x, float %y, float %z) {
 }
 
 define ptx_device double @t1_f64(double %x, double %y, double %z) {
-; CHECK: fma.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
 ; CHECK: ret;
   %a = fmul double %x, %y
   %b = fadd double %a, %z
diff --git a/test/CodeGen/NVPTX/fp-contract.ll b/test/CodeGen/NVPTX/fp-contract.ll
new file mode 100644
index 0000000..3f68b18
--- /dev/null
+++ b/test/CodeGen/NVPTX/fp-contract.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT
+
+target triple = "nvptx64-unknown-cuda"
+
+;; Make sure we are generating proper instruction sequences for fused ops
+;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
+;; add.f32 otherwise.  Without an explicit rounding mode on add.f32, ptxas
+;; is free to fuse with a multiply if it is able.  If fusion is not allowed,
+;; we do not form fma.rn at the PTX level and explicitly generate add.rn
+;; for all adds to prevent ptxas from fusion the ops.
+
+;; FAST-LABEL: @t0
+;; DEFAULT-LABEL: @t0
+define float @t0(float %a, float %b, float %c) {
+;; FAST: fma.rn.f32
+;; DEFAULT: mul.rn.f32
+;; DEFAULT: add.rn.f32
+  %v0 = fmul float %a, %b
+  %v1 = fadd float %v0, %c
+  ret float %v1
+}
+
+;; FAST-LABEL: @t1
+;; DEFAULT-LABEL: @t1
+define float @t1(float %a, float %b) {
+;; We cannot form an fma here, but make sure we explicitly emit add.rn.f32
+;; to prevent ptxas from fusing this with anything else.
+;; FAST: add.f32
+;; DEFAULT: add.rn.f32
+  %v1 = fadd float %a, %b
+  ret float %v1
+}
diff --git a/test/CodeGen/NVPTX/fp-literals.ll b/test/CodeGen/NVPTX/fp-literals.ll
index 0cc2413..755e0f9 100644
--- a/test/CodeGen/NVPTX/fp-literals.ll
+++ b/test/CodeGen/NVPTX/fp-literals.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+
+target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
 ; Make sure we can properly differentiate between single-precision and
 ; double-precision FP literals.
@@ -11,7 +14,7 @@ define float @myaddf(float %a) {
 }
 
 ; CHECK: myaddd
-; CHECK: add.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, 0d3FF0000000000000
+; CHECK: add.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, 0d3FF0000000000000
 define double @myaddd(double %a) {
   %ret = fadd double %a, 1.0
   ret double %ret
diff --git a/test/CodeGen/NVPTX/fp16.ll b/test/CodeGen/NVPTX/fp16.ll
new file mode 100644
index 0000000..8770399
--- /dev/null
+++ b/test/CodeGen/NVPTX/fp16.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=nvptx -verify-machineinstrs < %s | FileCheck %s
+
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone
+
+; CHECK-LABEL: @test_convert_fp16_to_fp32
+; CHECK: cvt.f32.f16
+define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+  %val = load i16 addrspace(1)* %in, align 2
+  %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
+  store float %cvt, float addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; CHECK-LABEL: @test_convert_fp16_to_fp64
+; CHECK: cvt.f64.f16
+define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+  %val = load i16 addrspace(1)* %in, align 2
+  %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
+  store double %cvt, double addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; CHECK-LABEL: @test_convert_fp32_to_fp16
+; CHECK: cvt.rn.f16.f32
+define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %val = load float addrspace(1)* %in, align 2
+  %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone
+  store i16 %cvt, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; CHECK-LABEL: @test_convert_fp64_to_fp16
+; CHECK: cvt.rn.f16.f64
+define void @test_convert_fp64_to_fp16(i16 addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
+  %val = load double addrspace(1)* %in, align 2
+  %cvt = call i16 @llvm.convert.to.fp16.f64(double %val) nounwind readnone
+  store i16 %cvt, i16 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/NVPTX/half.ll b/test/CodeGen/NVPTX/half.ll
new file mode 100644
index 0000000..aa08cc7
--- /dev/null
+++ b/test/CodeGen/NVPTX/half.ll
@@ -0,0 +1,70 @@
+; RUN: llc < %s -march=nvptx | FileCheck %s
+
+define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) {
+; CHECK-LABEL: @test_load_store
+; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
+; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
+  %val = load half addrspace(1)* %in
+  store half %val, half addrspace(1) * %out
+  ret void
+}
+
+define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) {
+; CHECK-LABEL: @test_bitcast_from_half
+; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
+; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
+  %val = load half addrspace(1) * %in
+  %val_int = bitcast half %val to i16
+  store i16 %val_int, i16 addrspace(1)* %out
+  ret void
+}
+
+define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) {
+; CHECK-LABEL: @test_bitcast_to_half
+; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
+; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
+  %val = load i16 addrspace(1)* %in
+  %val_fp = bitcast i16 %val to half
+  store half %val_fp, half addrspace(1)* %out
+  ret void
+}
+
+define void @test_extend32(half addrspace(1)* %in, float addrspace(1)* %out) {
+; CHECK-LABEL: @test_extend32
+; CHECK: cvt.f32.f16
+
+  %val16 = load half addrspace(1)* %in
+  %val32 = fpext half %val16 to float
+  store float %val32, float addrspace(1)* %out
+  ret void
+}
+
+define void @test_extend64(half addrspace(1)* %in, double addrspace(1)* %out) {
+; CHECK-LABEL: @test_extend64
+; CHECK: cvt.f64.f16
+
+  %val16 = load half addrspace(1)* %in
+  %val64 = fpext half %val16 to double
+  store double %val64, double addrspace(1)* %out
+  ret void
+}
+
+define void @test_trunc32(float addrspace(1)* %in, half addrspace(1)* %out) {
+; CHECK-LABEL: test_trunc32
+; CHECK: cvt.rn.f16.f32
+
+  %val32 = load float addrspace(1)* %in
+  %val16 = fptrunc float %val32 to half
+  store half %val16, half addrspace(1)* %out
+  ret void
+}
+
+define void @test_trunc64(double addrspace(1)* %in, half addrspace(1)* %out) {
+; CHECK-LABEL: @test_trunc64
+; CHECK: cvt.rn.f16.f64
+
+  %val32 = load double addrspace(1)* %in
+  %val16 = fptrunc double %val32 to half
+  store half %val16, half addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/NVPTX/implicit-def.ll b/test/CodeGen/NVPTX/implicit-def.ll
index 06d3d56..2d2c6e5 100644
--- a/test/CodeGen/NVPTX/implicit-def.ll
+++ b/test/CodeGen/NVPTX/implicit-def.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 -asm-verbose=1 | FileCheck %s
 
 ; CHECK: // implicit-def: %f[[F0:[0-9]+]]
-; CHECK: add.f32         %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]];
+; CHECK: add.rn.f32         %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]];
 define float @foo(float %a) {
   %ret = fadd float %a, undef
   ret float %ret
diff --git a/test/CodeGen/NVPTX/intrinsic-old.ll b/test/CodeGen/NVPTX/intrinsic-old.ll
index af91bb4..3c51776 100644
--- a/test/CodeGen/NVPTX/intrinsic-old.ll
+++ b/test/CodeGen/NVPTX/intrinsic-old.ll
@@ -198,7 +198,7 @@ define ptx_device i32 @test_clock() {
 }
 
 define ptx_device i64 @test_clock64() {
-; CHECK: mov.u64 %rl{{[0-9]+}}, %clock64;
+; CHECK: mov.u64 %rd{{[0-9]+}}, %clock64;
 ; CHECK: ret;
 	%x = call i64 @llvm.ptx.read.clock64()
 	ret i64 %x
diff --git a/test/CodeGen/NVPTX/intrinsics.ll b/test/CodeGen/NVPTX/intrinsics.ll
index 78e1e77..34b671d 100644
--- a/test/CodeGen/NVPTX/intrinsics.ll
+++ b/test/CodeGen/NVPTX/intrinsics.ll
@@ -9,7 +9,7 @@ define ptx_device float @test_fabsf(float %f) {
 }
 
 define ptx_device double @test_fabs(double %d) {
-; CHECK: abs.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: abs.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}};
 ; CHECK: ret;
 	%x = call double @llvm.fabs.f64(double %d)
 	ret double %x
diff --git a/test/CodeGen/NVPTX/ld-addrspace.ll b/test/CodeGen/NVPTX/ld-addrspace.ll
index 133ef09..f33659c 100644
--- a/test/CodeGen/NVPTX/ld-addrspace.ll
+++ b/test/CodeGen/NVPTX/ld-addrspace.ll
@@ -6,7 +6,7 @@
 define i8 @ld_global_i8(i8 addrspace(1)* %ptr) {
 ; PTX32: ld.global.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.global.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i8 addrspace(1)* %ptr
   ret i8 %a
@@ -15,7 +15,7 @@ define i8 @ld_global_i8(i8 addrspace(1)* %ptr) {
 define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) {
 ; PTX32: ld.shared.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.shared.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i8 addrspace(3)* %ptr
   ret i8 %a
@@ -24,7 +24,7 @@ define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) {
 define i8 @ld_local_i8(i8 addrspace(5)* %ptr) {
 ; PTX32: ld.local.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.local.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i8 addrspace(5)* %ptr
   ret i8 %a
@@ -34,7 +34,7 @@ define i8 @ld_local_i8(i8 addrspace(5)* %ptr) {
 define i16 @ld_global_i16(i16 addrspace(1)* %ptr) {
 ; PTX32: ld.global.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.global.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i16 addrspace(1)* %ptr
   ret i16 %a
@@ -43,7 +43,7 @@ define i16 @ld_global_i16(i16 addrspace(1)* %ptr) {
 define i16 @ld_shared_i16(i16 addrspace(3)* %ptr) {
 ; PTX32: ld.shared.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.shared.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i16 addrspace(3)* %ptr
   ret i16 %a
@@ -52,7 +52,7 @@ define i16 @ld_shared_i16(i16 addrspace(3)* %ptr) {
 define i16 @ld_local_i16(i16 addrspace(5)* %ptr) {
 ; PTX32: ld.local.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.local.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i16 addrspace(5)* %ptr
   ret i16 %a
@@ -62,7 +62,7 @@ define i16 @ld_local_i16(i16 addrspace(5)* %ptr) {
 define i32 @ld_global_i32(i32 addrspace(1)* %ptr) {
 ; PTX32: ld.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.global.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i32 addrspace(1)* %ptr
   ret i32 %a
@@ -71,7 +71,7 @@ define i32 @ld_global_i32(i32 addrspace(1)* %ptr) {
 define i32 @ld_shared_i32(i32 addrspace(3)* %ptr) {
 ; PTX32: ld.shared.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i32 addrspace(3)* %ptr
   ret i32 %a
@@ -80,7 +80,7 @@ define i32 @ld_shared_i32(i32 addrspace(3)* %ptr) {
 define i32 @ld_local_i32(i32 addrspace(5)* %ptr) {
 ; PTX32: ld.local.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.local.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i32 addrspace(5)* %ptr
   ret i32 %a
@@ -88,27 +88,27 @@ define i32 @ld_local_i32(i32 addrspace(5)* %ptr) {
 
 ;; i64
 define i64 @ld_global_i64(i64 addrspace(1)* %ptr) {
-; PTX32: ld.global.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.global.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.global.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i64 addrspace(1)* %ptr
   ret i64 %a
 }
 
 define i64 @ld_shared_i64(i64 addrspace(3)* %ptr) {
-; PTX32: ld.shared.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.shared.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.shared.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i64 addrspace(3)* %ptr
   ret i64 %a
 }
 
 define i64 @ld_local_i64(i64 addrspace(5)* %ptr) {
-; PTX32: ld.local.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.local.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.local.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i64 addrspace(5)* %ptr
   ret i64 %a
@@ -118,7 +118,7 @@ define i64 @ld_local_i64(i64 addrspace(5)* %ptr) {
 define float @ld_global_f32(float addrspace(1)* %ptr) {
 ; PTX32: ld.global.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.global.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load float addrspace(1)* %ptr
   ret float %a
@@ -127,7 +127,7 @@ define float @ld_global_f32(float addrspace(1)* %ptr) {
 define float @ld_shared_f32(float addrspace(3)* %ptr) {
 ; PTX32: ld.shared.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.shared.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load float addrspace(3)* %ptr
   ret float %a
@@ -136,7 +136,7 @@ define float @ld_shared_f32(float addrspace(3)* %ptr) {
 define float @ld_local_f32(float addrspace(5)* %ptr) {
 ; PTX32: ld.local.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.local.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load float addrspace(5)* %ptr
   ret float %a
@@ -144,27 +144,27 @@ define float @ld_local_f32(float addrspace(5)* %ptr) {
 
 ;; f64
 define double @ld_global_f64(double addrspace(1)* %ptr) {
-; PTX32: ld.global.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.global.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.global.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load double addrspace(1)* %ptr
   ret double %a
 }
 
 define double @ld_shared_f64(double addrspace(3)* %ptr) {
-; PTX32: ld.shared.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.shared.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.shared.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load double addrspace(3)* %ptr
   ret double %a
 }
 
 define double @ld_local_f64(double addrspace(5)* %ptr) {
-; PTX32: ld.local.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.local.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.local.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load double addrspace(5)* %ptr
   ret double %a
diff --git a/test/CodeGen/NVPTX/ld-generic.ll b/test/CodeGen/NVPTX/ld-generic.ll
index 3728268..d629e0e 100644
--- a/test/CodeGen/NVPTX/ld-generic.ll
+++ b/test/CodeGen/NVPTX/ld-generic.ll
@@ -6,7 +6,7 @@
 define i8 @ld_global_i8(i8 addrspace(0)* %ptr) {
 ; PTX32: ld.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i8 addrspace(0)* %ptr
   ret i8 %a
@@ -16,7 +16,7 @@ define i8 @ld_global_i8(i8 addrspace(0)* %ptr) {
 define i16 @ld_global_i16(i16 addrspace(0)* %ptr) {
 ; PTX32: ld.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i16 addrspace(0)* %ptr
   ret i16 %a
@@ -26,7 +26,7 @@ define i16 @ld_global_i16(i16 addrspace(0)* %ptr) {
 define i32 @ld_global_i32(i32 addrspace(0)* %ptr) {
 ; PTX32: ld.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i32 addrspace(0)* %ptr
   ret i32 %a
@@ -34,9 +34,9 @@ define i32 @ld_global_i32(i32 addrspace(0)* %ptr) {
 
 ;; i64
 define i64 @ld_global_i64(i64 addrspace(0)* %ptr) {
-; PTX32: ld.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i64 addrspace(0)* %ptr
   ret i64 %a
@@ -46,7 +46,7 @@ define i64 @ld_global_i64(i64 addrspace(0)* %ptr) {
 define float @ld_global_f32(float addrspace(0)* %ptr) {
 ; PTX32: ld.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load float addrspace(0)* %ptr
   ret float %a
@@ -54,9 +54,9 @@ define float @ld_global_f32(float addrspace(0)* %ptr) {
 
 ;; f64
 define double @ld_global_f64(double addrspace(0)* %ptr) {
-; PTX32: ld.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load double addrspace(0)* %ptr
   ret double %a
diff --git a/test/CodeGen/NVPTX/ldu-i8.ll b/test/CodeGen/NVPTX/ldu-i8.ll
index 9cc6675..36c99b3 100644
--- a/test/CodeGen/NVPTX/ldu-i8.ll
+++ b/test/CodeGen/NVPTX/ldu-i8.ll
@@ -2,15 +2,13 @@
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
-declare i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8*)
+declare i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8*, i32)
 
 define i8 @foo(i8* %a) {
 ; Ensure we properly truncate off the high-order 24 bits
 ; CHECK:        ldu.global.u8
 ; CHECK:        cvt.u32.u16
 ; CHECK:        and.b32         %r{{[0-9]+}}, %r{{[0-9]+}}, 255
-  %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8* %a), !align !0
+  %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8* %a, i32 4)
   ret i8 %val
 }
-
-!0 = metadata !{i32 4}
diff --git a/test/CodeGen/NVPTX/ldu-ldg.ll b/test/CodeGen/NVPTX/ldu-ldg.ll
index 3b0619f..4bfd68c 100644
--- a/test/CodeGen/NVPTX/ldu-ldg.ll
+++ b/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -1,40 +1,36 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 
 
-declare i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr)
-declare i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr)
-declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr)
-declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr)
+declare i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)
+declare i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)
+declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)
+declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)
 
 
 ; CHECK: func0
 define i8 @func0(i8 addrspace(1)* %ptr) {
 ; ldu.global.u8
-  %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr), !align !0
+  %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 4)
   ret i8 %val
 }
 
 ; CHECK: func1
 define i32 @func1(i32 addrspace(1)* %ptr) {
 ; ldu.global.u32
-  %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr), !align !0
+  %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 4)
   ret i32 %val
 }
 
 ; CHECK: func2
 define i8 @func2(i8 addrspace(1)* %ptr) {
 ; ld.global.nc.u8
-  %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr), !align !0
+  %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 4)
   ret i8 %val
 }
 
 ; CHECK: func3
 define i32 @func3(i32 addrspace(1)* %ptr) {
 ; ld.global.nc.u32
-  %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr), !align !0
+  %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 4)
   ret i32 %val
 }
-
-
-
-!0 = metadata !{i32 4}
diff --git a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
index 55707ea..fd35a75 100644
--- a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
+++ b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
@@ -7,15 +7,13 @@ define void @reg_plus_offset(i32* %a) {
 ; CHECK:        ldu.global.u32  %r{{[0-9]+}}, [%r{{[0-9]+}}+32];
 ; CHECK:        ldu.global.u32  %r{{[0-9]+}}, [%r{{[0-9]+}}+36];
   %p2 = getelementptr i32* %a, i32 8
-  %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p2), !align !1
+  %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p2, i32 4)
   %p3 = getelementptr i32* %a, i32 9
-  %t2 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p3), !align !1
+  %t2 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p3, i32 4)
   %t3 = mul i32 %t1, %t2
   store i32 %t3, i32* %a
   ret void
 }
 
-!1 = metadata !{ i32 4 }
-
-declare i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32*)
+declare i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32*, i32)
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
diff --git a/test/CodeGen/NVPTX/local-stack-frame.ll b/test/CodeGen/NVPTX/local-stack-frame.ll
index c0d7d1c..377eee9 100644
--- a/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -7,8 +7,8 @@
 ; PTX32:        cvta.local.u32   %SP, %r{{[0-9]+}};
 ; PTX32:        ld.param.u32     %r{{[0-9]+}}, [foo_param_0];
 ; PTX32:        st.volatile.u32  [%SP+0], %r{{[0-9]+}};
-; PTX64:        mov.u64          %rl{{[0-9]+}}, __local_depot{{[0-9]+}};
-; PTX64:        cvta.local.u64   %SP, %rl{{[0-9]+}};
+; PTX64:        mov.u64          %rd{{[0-9]+}}, __local_depot{{[0-9]+}};
+; PTX64:        cvta.local.u64   %SP, %rd{{[0-9]+}};
 ; PTX64:        ld.param.u32     %r{{[0-9]+}}, [foo_param_0];
 ; PTX64:        st.volatile.u32  [%SP+0], %r{{[0-9]+}};
 define void @foo(i32 %a) {
diff --git a/test/CodeGen/NVPTX/machine-sink.ll b/test/CodeGen/NVPTX/machine-sink.ll
new file mode 100644
index 0000000..3614bea
--- /dev/null
+++ b/test/CodeGen/NVPTX/machine-sink.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+@scalar1 = internal addrspace(3) global float 0.000000e+00, align 4
+@scalar2 = internal addrspace(3) global float 0.000000e+00, align 4
+
+; We shouldn't sink mul.rn.f32 to BB %merge because BB %merge post-dominates
+; BB %entry. Over-sinking created more register pressure on this example. The
+; backend would sink the fmuls to BB %merge, but not the loads for being
+; conservative on sinking memory accesses. As a result, the loads and
+; the two fmuls would be separated to two basic blocks, causing two
+; cross-BB live ranges.
+define float @post_dominate(float %x, i1 %cond) {
+; CHECK-LABEL: post_dominate(
+entry:
+  %0 = load float* addrspacecast (float addrspace(3)* @scalar1 to float*), align 4
+  %1 = load float* addrspacecast (float addrspace(3)* @scalar2 to float*), align 4
+; CHECK: ld.shared.f32
+; CHECK: ld.shared.f32
+  %2 = fmul float %0, %0
+  %3 = fmul float %1, %2
+; CHECK-NOT: bra
+; CHECK: mul.rn.f32
+; CHECK: mul.rn.f32
+  br i1 %cond, label %then, label %merge
+
+then:
+  %z = fadd float %x, %x
+  br label %then2
+
+then2:
+  %z2 = fadd float %z, %z
+  br label %merge
+
+merge:
+  %y = phi float [ 0.0, %entry ], [ %z2, %then2 ]
+  %w = fadd float %y, %3
+  ret float %w
+}
diff --git a/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
new file mode 100644
index 0000000..90c9c43
--- /dev/null
+++ b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: t1
+define <4 x float> @t1(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK-NOT: ld.v2
+; CHECK-NOT: ld.f32
+; CHECK: ld.u8
+  %cast = bitcast i8* %p1 to <4 x float>*
+  %r = load <4 x float>* %cast, align 1
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: t2
+define <4 x float> @t2(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK-NOT: ld.v2
+; CHECK: ld.f32
+  %cast = bitcast i8* %p1 to <4 x float>*
+  %r = load <4 x float>* %cast, align 4
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: t3
+define <4 x float> @t3(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK: ld.v2
+  %cast = bitcast i8* %p1 to <4 x float>*
+  %r = load <4 x float>* %cast, align 8
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: t4
+define <4 x float> @t4(i8* %p1) {
+; CHECK: ld.v4
+  %cast = bitcast i8* %p1 to <4 x float>*
+  %r = load <4 x float>* %cast, align 16
+  ret <4 x float> %r
+}
+
+
+; CHECK-LABEL: s1
+define void @s1(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+; CHECK-NOT: st.v2
+; CHECK-NOT: st.f32
+; CHECK: st.u8
+  store <4 x float> %v, <4 x float>* %p1, align 1
+  ret void
+}
+
+; CHECK-LABEL: s2
+define void @s2(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+; CHECK-NOT: st.v2
+; CHECK: st.f32
+  store <4 x float> %v, <4 x float>* %p1, align 4
+  ret void
+}
+
+; CHECK-LABEL: s3
+define void @s3(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+  store <4 x float> %v, <4 x float>* %p1, align 8
+  ret void
+}
+
+; CHECK-LABEL: s4
+define void @s4(<4 x float>* %p1, <4 x float> %v) {
+; CHECK: st.v4
+  store <4 x float> %v, <4 x float>* %p1, align 16
+  ret void
+}
+
diff --git a/test/CodeGen/NVPTX/mulwide.ll b/test/CodeGen/NVPTX/mulwide.ll
index 927946c..1ddf973 100644
--- a/test/CodeGen/NVPTX/mulwide.ll
+++ b/test/CodeGen/NVPTX/mulwide.ll
@@ -1,37 +1,90 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O3 | FileCheck %s --check-prefix=OPT
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O0 | FileCheck %s --check-prefix=NOOPT
 
-; CHECK: mulwide16
+; OPT-LABEL: @mulwide16
+; NOOPT-LABEL: @mulwide16
 define i32 @mulwide16(i16 %a, i16 %b) {
-; CHECK: mul.wide.s16
+; OPT: mul.wide.s16
+; NOOPT: mul.lo.s32
   %val0 = sext i16 %a to i32
   %val1 = sext i16 %b to i32
   %val2 = mul i32 %val0, %val1
   ret i32 %val2
 }
 
-; CHECK: mulwideu16
+; OPT-LABEL: @mulwideu16
+; NOOPT-LABEL: @mulwideu16
 define i32 @mulwideu16(i16 %a, i16 %b) {
-; CHECK: mul.wide.u16
+; OPT: mul.wide.u16
+; NOOPT: mul.lo.s32
   %val0 = zext i16 %a to i32
   %val1 = zext i16 %b to i32
   %val2 = mul i32 %val0, %val1
   ret i32 %val2
 }
 
-; CHECK: mulwide32
+; OPT-LABEL: @mulwide8
+; NOOPT-LABEL: @mulwide8
+define i32 @mulwide8(i8 %a, i8 %b) {
+; OPT: mul.wide.s16
+; NOOPT: mul.lo.s32
+  %val0 = sext i8 %a to i32
+  %val1 = sext i8 %b to i32
+  %val2 = mul i32 %val0, %val1
+  ret i32 %val2
+}
+
+; OPT-LABEL: @mulwideu8
+; NOOPT-LABEL: @mulwideu8
+define i32 @mulwideu8(i8 %a, i8 %b) {
+; OPT: mul.wide.u16
+; NOOPT: mul.lo.s32
+  %val0 = zext i8 %a to i32
+  %val1 = zext i8 %b to i32
+  %val2 = mul i32 %val0, %val1
+  ret i32 %val2
+}
+
+; OPT-LABEL: @mulwide32
+; NOOPT-LABEL: @mulwide32
 define i64 @mulwide32(i32 %a, i32 %b) {
-; CHECK: mul.wide.s32
+; OPT: mul.wide.s32
+; NOOPT: mul.lo.s64
   %val0 = sext i32 %a to i64
   %val1 = sext i32 %b to i64
   %val2 = mul i64 %val0, %val1
   ret i64 %val2
 }
 
-; CHECK: mulwideu32
+; OPT-LABEL: @mulwideu32
+; NOOPT-LABEL: @mulwideu32
 define i64 @mulwideu32(i32 %a, i32 %b) {
-; CHECK: mul.wide.u32
+; OPT: mul.wide.u32
+; NOOPT: mul.lo.s64
   %val0 = zext i32 %a to i64
   %val1 = zext i32 %b to i64
   %val2 = mul i64 %val0, %val1
   ret i64 %val2
 }
+
+; OPT-LABEL: @mulwideu7
+; NOOPT-LABEL: @mulwideu7
+define i64 @mulwideu7(i7 %a, i7 %b) {
+; OPT: mul.wide.u32
+; NOOPT: mul.lo.s64
+  %val0 = zext i7 %a to i64
+  %val1 = zext i7 %b to i64
+  %val2 = mul i64 %val0, %val1
+  ret i64 %val2
+}
+
+; OPT-LABEL: @mulwides7
+; NOOPT-LABEL: @mulwides7
+define i64 @mulwides7(i7 %a, i7 %b) {
+; OPT: mul.wide.s32
+; NOOPT: mul.lo.s64
+  %val0 = sext i7 %a to i64
+  %val1 = sext i7 %b to i64
+  %val2 = mul i64 %val0, %val1
+  ret i64 %val2
+}
diff --git a/test/CodeGen/NVPTX/pr13291-i1-store.ll b/test/CodeGen/NVPTX/pr13291-i1-store.ll
index e7a81be..cc67a6f 100644
--- a/test/CodeGen/NVPTX/pr13291-i1-store.ll
+++ b/test/CodeGen/NVPTX/pr13291-i1-store.ll
@@ -5,7 +5,7 @@ define ptx_kernel void @t1(i1* %a) {
 ; PTX32:      mov.u16 %rs{{[0-9]+}}, 0;
 ; PTX32-NEXT: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}};
 ; PTX64:      mov.u16 %rs{{[0-9]+}}, 0;
-; PTX64-NEXT: st.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}};
+; PTX64-NEXT: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}};
   store i1 false, i1* %a
   ret void
 }
@@ -15,7 +15,7 @@ define ptx_kernel void @t2(i1* %a, i8* %b) {
 ; PTX32: ld.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 ; PTX32: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1;
-; PTX64: ld.u8 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 ; PTX64: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 
diff --git a/test/CodeGen/NVPTX/st-addrspace.ll b/test/CodeGen/NVPTX/st-addrspace.ll
index 68c09fe..34a83f3 100644
--- a/test/CodeGen/NVPTX/st-addrspace.ll
+++ b/test/CodeGen/NVPTX/st-addrspace.ll
@@ -7,7 +7,7 @@
 define void @st_global_i8(i8 addrspace(1)* %ptr, i8 %a) {
 ; PTX32: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.global.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i8 %a, i8 addrspace(1)* %ptr
   ret void
@@ -16,7 +16,7 @@ define void @st_global_i8(i8 addrspace(1)* %ptr, i8 %a) {
 define void @st_shared_i8(i8 addrspace(3)* %ptr, i8 %a) {
 ; PTX32: st.shared.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.shared.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i8 %a, i8 addrspace(3)* %ptr
   ret void
@@ -25,7 +25,7 @@ define void @st_shared_i8(i8 addrspace(3)* %ptr, i8 %a) {
 define void @st_local_i8(i8 addrspace(5)* %ptr, i8 %a) {
 ; PTX32: st.local.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.local.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i8 %a, i8 addrspace(5)* %ptr
   ret void
@@ -36,7 +36,7 @@ define void @st_local_i8(i8 addrspace(5)* %ptr, i8 %a) {
 define void @st_global_i16(i16 addrspace(1)* %ptr, i16 %a) {
 ; PTX32: st.global.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.global.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i16 %a, i16 addrspace(1)* %ptr
   ret void
@@ -45,7 +45,7 @@ define void @st_global_i16(i16 addrspace(1)* %ptr, i16 %a) {
 define void @st_shared_i16(i16 addrspace(3)* %ptr, i16 %a) {
 ; PTX32: st.shared.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.shared.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i16 %a, i16 addrspace(3)* %ptr
   ret void
@@ -54,7 +54,7 @@ define void @st_shared_i16(i16 addrspace(3)* %ptr, i16 %a) {
 define void @st_local_i16(i16 addrspace(5)* %ptr, i16 %a) {
 ; PTX32: st.local.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.local.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i16 %a, i16 addrspace(5)* %ptr
   ret void
@@ -65,7 +65,7 @@ define void @st_local_i16(i16 addrspace(5)* %ptr, i16 %a) {
 define void @st_global_i32(i32 addrspace(1)* %ptr, i32 %a) {
 ; PTX32: st.global.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.global.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX64: ret
   store i32 %a, i32 addrspace(1)* %ptr
   ret void
@@ -74,7 +74,7 @@ define void @st_global_i32(i32 addrspace(1)* %ptr, i32 %a) {
 define void @st_shared_i32(i32 addrspace(3)* %ptr, i32 %a) {
 ; PTX32: st.shared.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.shared.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX64: ret
   store i32 %a, i32 addrspace(3)* %ptr
   ret void
@@ -83,7 +83,7 @@ define void @st_shared_i32(i32 addrspace(3)* %ptr, i32 %a) {
 define void @st_local_i32(i32 addrspace(5)* %ptr, i32 %a) {
 ; PTX32: st.local.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.local.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX64: ret
   store i32 %a, i32 addrspace(5)* %ptr
   ret void
@@ -92,27 +92,27 @@ define void @st_local_i32(i32 addrspace(5)* %ptr, i32 %a) {
 ;; i64
 
 define void @st_global_i64(i64 addrspace(1)* %ptr, i64 %a) {
-; PTX32: st.global.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX32: st.global.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.global.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX64: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX64: ret
   store i64 %a, i64 addrspace(1)* %ptr
   ret void
 }
 
 define void @st_shared_i64(i64 addrspace(3)* %ptr, i64 %a) {
-; PTX32: st.shared.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX32: st.shared.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.shared.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX64: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX64: ret
   store i64 %a, i64 addrspace(3)* %ptr
   ret void
 }
 
 define void @st_local_i64(i64 addrspace(5)* %ptr, i64 %a) {
-; PTX32: st.local.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX32: st.local.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.local.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX64: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX64: ret
   store i64 %a, i64 addrspace(5)* %ptr
   ret void
@@ -123,7 +123,7 @@ define void @st_local_i64(i64 addrspace(5)* %ptr, i64 %a) {
 define void @st_global_f32(float addrspace(1)* %ptr, float %a) {
 ; PTX32: st.global.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.global.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX64: ret
   store float %a, float addrspace(1)* %ptr
   ret void
@@ -132,7 +132,7 @@ define void @st_global_f32(float addrspace(1)* %ptr, float %a) {
 define void @st_shared_f32(float addrspace(3)* %ptr, float %a) {
 ; PTX32: st.shared.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.shared.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX64: ret
   store float %a, float addrspace(3)* %ptr
   ret void
@@ -141,7 +141,7 @@ define void @st_shared_f32(float addrspace(3)* %ptr, float %a) {
 define void @st_local_f32(float addrspace(5)* %ptr, float %a) {
 ; PTX32: st.local.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.local.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX64: ret
   store float %a, float addrspace(5)* %ptr
   ret void
@@ -150,27 +150,27 @@ define void @st_local_f32(float addrspace(5)* %ptr, float %a) {
 ;; f64
 
 define void @st_global_f64(double addrspace(1)* %ptr, double %a) {
-; PTX32: st.global.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX32: st.global.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.global.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX64: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX64: ret
   store double %a, double addrspace(1)* %ptr
   ret void
 }
 
 define void @st_shared_f64(double addrspace(3)* %ptr, double %a) {
-; PTX32: st.shared.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX32: st.shared.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.shared.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX64: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX64: ret
   store double %a, double addrspace(3)* %ptr
   ret void
 }
 
 define void @st_local_f64(double addrspace(5)* %ptr, double %a) {
-; PTX32: st.local.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX32: st.local.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.local.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX64: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX64: ret
   store double %a, double addrspace(5)* %ptr
   ret void
diff --git a/test/CodeGen/NVPTX/st-generic.ll b/test/CodeGen/NVPTX/st-generic.ll
index b9c616f..022f7ab 100644
--- a/test/CodeGen/NVPTX/st-generic.ll
+++ b/test/CodeGen/NVPTX/st-generic.ll
@@ -7,7 +7,7 @@
 define void @st_global_i8(i8 addrspace(0)* %ptr, i8 %a) {
 ; PTX32: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i8 %a, i8 addrspace(0)* %ptr
   ret void
@@ -18,7 +18,7 @@ define void @st_global_i8(i8 addrspace(0)* %ptr, i8 %a) {
 define void @st_global_i16(i16 addrspace(0)* %ptr, i16 %a) {
 ; PTX32: st.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i16 %a, i16 addrspace(0)* %ptr
   ret void
@@ -29,7 +29,7 @@ define void @st_global_i16(i16 addrspace(0)* %ptr, i16 %a) {
 define void @st_global_i32(i32 addrspace(0)* %ptr, i32 %a) {
 ; PTX32: st.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX64: ret
   store i32 %a, i32 addrspace(0)* %ptr
   ret void
@@ -38,9 +38,9 @@ define void @st_global_i32(i32 addrspace(0)* %ptr, i32 %a) {
 ;; i64
 
 define void @st_global_i64(i64 addrspace(0)* %ptr, i64 %a) {
-; PTX32: st.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX32: st.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX64: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX64: ret
   store i64 %a, i64 addrspace(0)* %ptr
   ret void
@@ -51,7 +51,7 @@ define void @st_global_i64(i64 addrspace(0)* %ptr, i64 %a) {
 define void @st_global_f32(float addrspace(0)* %ptr, float %a) {
 ; PTX32: st.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX64: ret
   store float %a, float addrspace(0)* %ptr
   ret void
@@ -60,9 +60,9 @@ define void @st_global_f32(float addrspace(0)* %ptr, float %a) {
 ;; f64
 
 define void @st_global_f64(double addrspace(0)* %ptr, double %a) {
-; PTX32: st.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX32: st.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX64: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX64: ret
   store double %a, double addrspace(0)* %ptr
   ret void
diff --git a/test/CodeGen/NVPTX/surf-read-cuda.ll b/test/CodeGen/NVPTX/surf-read-cuda.ll
new file mode 100644
index 0000000..10a1ecc
--- /dev/null
+++ b/test/CodeGen/NVPTX/surf-read-cuda.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20
+; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30
+
+target triple = "nvptx-unknown-cuda"
+
+declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)
+declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)
+
+
+; SM20-LABEL: .entry foo
+; SM30-LABEL: .entry foo
+define void @foo(i64 %img, float* %red, i32 %idx) {
+; SM20: ld.param.u64    %rd[[SURFREG:[0-9]+]], [foo_param_0];
+; SM20: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFREG]], {%r{{[0-9]+}}}]
+; SM30: ld.param.u64    %rd[[SURFREG:[0-9]+]], [foo_param_0];
+; SM30: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFREG]], {%r{{[0-9]+}}}]
+  %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx)
+; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
+; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
+  %ret = sitofp i32 %val to float
+; SM20: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
+; SM30: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
+  store float %ret, float* %red
+  ret void
+}
+
+@surf0 = internal addrspace(1) global i64 0, align 8
+
+; SM20-LABEL: .entry bar
+; SM30-LABEL: .entry bar
+define void @bar(float* %red, i32 %idx) {
+; SM30: mov.u64 %rd[[SURFHANDLE:[0-9]+]], surf0
+  %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0)
+; SM20: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [surf0, {%r{{[0-9]+}}}]
+; SM30: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFHANDLE]], {%r{{[0-9]+}}}]
+  %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %surfHandle, i32 %idx)
+; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
+; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
+  %ret = sitofp i32 %val to float
+; SM20: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
+; SM30: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
+  store float %ret, float* %red
+  ret void
+}
+
+
+
+
+!nvvm.annotations = !{!1, !2, !3}
+!1 = metadata !{void (i64, float*, i32)* @foo, metadata !"kernel", i32 1}
+!2 = metadata !{void (float*, i32)* @bar, metadata !"kernel", i32 1}
+!3 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1}
+
diff --git a/test/CodeGen/NVPTX/surf-write-cuda.ll b/test/CodeGen/NVPTX/surf-write-cuda.ll
new file mode 100644
index 0000000..654c47f
--- /dev/null
+++ b/test/CodeGen/NVPTX/surf-write-cuda.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20
+; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30
+
+target triple = "nvptx-unknown-cuda"
+
+declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32)
+declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)
+
+
+; SM20-LABEL: .entry foo
+; SM30-LABEL: .entry foo
+define void @foo(i64 %img, i32 %val, i32 %idx) {
+; SM20: ld.param.u64    %rd[[SURFREG:[0-9]+]], [foo_param_0];
+; SM20: sust.b.1d.b32.trap [%rd[[SURFREG]], {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
+; SM30: ld.param.u64    %rd[[SURFREG:[0-9]+]], [foo_param_0];
+; SM30: sust.b.1d.b32.trap [%rd[[SURFREG]], {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
+  tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %img, i32 %idx, i32 %val)
+  ret void
+}
+
+
+@surf0 = internal addrspace(1) global i64 0, align 8
+
+
+
+; SM20-LABEL: .entry bar
+; SM30-LABEL: .entry bar
+define void @bar(i32 %val, i32 %idx) {
+; SM30: mov.u64 %rd[[SURFHANDLE:[0-9]+]], surf0
+  %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0)
+; SM20: sust.b.1d.b32.trap [surf0, {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
+; SM30: sust.b.1d.b32.trap [%rd[[SURFREG]], {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
+  tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %surfHandle, i32 %idx, i32 %val)
+  ret void
+}
+
+
+!nvvm.annotations = !{!1, !2, !3}
+!1 = metadata !{void (i64, i32, i32)* @foo, metadata !"kernel", i32 1}
+!2 = metadata !{void (i32, i32)* @bar, metadata !"kernel", i32 1}
+!3 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1}
+
diff --git a/test/CodeGen/NVPTX/tex-read-cuda.ll b/test/CodeGen/NVPTX/tex-read-cuda.ll
new file mode 100644
index 0000000..ee0cefa
--- /dev/null
+++ b/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20
+; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30
+
+
+target triple = "nvptx-unknown-cuda"
+
+declare { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64, i32)
+declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)
+
+; SM20-LABEL: .entry foo
+; SM30-LABEL: .entry foo
+define void @foo(i64 %img, float* %red, i32 %idx) {
+; SM20: ld.param.u64    %rd[[TEXREG:[0-9]+]], [foo_param_0];
+; SM20: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXREG]], {%r{{[0-9]+}}}]
+; SM30: ld.param.u64    %rd[[TEXREG:[0-9]+]], [foo_param_0];
+; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXREG]], {%r{{[0-9]+}}}]
+  %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %img, i32 %idx)
+  %ret = extractvalue { float, float, float, float } %val, 0
+; SM20: st.f32 [%r{{[0-9]+}}], %f[[RED]]
+; SM30: st.f32 [%r{{[0-9]+}}], %f[[RED]]
+  store float %ret, float* %red
+  ret void
+}
+
+
+@tex0 = internal addrspace(1) global i64 0, align 8
+
+; SM20-LABEL: .entry bar
+; SM30-LABEL: .entry bar
+define void @bar(float* %red, i32 %idx) {
+; SM30: mov.u64 %rd[[TEXHANDLE:[0-9]+]], tex0 
+  %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0)
+; SM20: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [tex0, {%r{{[0-9]+}}}]
+; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXHANDLE]], {%r{{[0-9]+}}}]
+  %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx)
+  %ret = extractvalue { float, float, float, float } %val, 0
+; SM20: st.f32 [%r{{[0-9]+}}], %f[[RED]]
+; SM30: st.f32 [%r{{[0-9]+}}], %f[[RED]]
+  store float %ret, float* %red
+  ret void
+}
+
+!nvvm.annotations = !{!1, !2, !3}
+!1 = metadata !{void (i64, float*, i32)* @foo, metadata !"kernel", i32 1}
+!2 = metadata !{void (float*, i32)* @bar, metadata !"kernel", i32 1}
+!3 = metadata !{i64 addrspace(1)* @tex0, metadata !"texture", i32 1}
diff --git a/test/CodeGen/NVPTX/tex-read.ll b/test/CodeGen/NVPTX/tex-read.ll
index 291060b..55e4bfc 100644
--- a/test/CodeGen/NVPTX/tex-read.ll
+++ b/test/CodeGen/NVPTX/tex-read.ll
@@ -2,12 +2,12 @@
 
 target triple = "nvptx-unknown-nvcl"
 
-declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.i32(i64, i64, i32)
+declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64, i64, i32)
 
 ; CHECK: .entry foo
 define void @foo(i64 %img, i64 %sampler, float* %red, i32 %idx) {
 ; CHECK: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}]
-  %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.i32(i64 %img, i64 %sampler, i32 %idx)
+  %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64 %img, i64 %sampler, i32 %idx)
   %ret = extractvalue { float, float, float, float } %val, 0
 ; CHECK: st.f32 [%r{{[0-9]+}}], %f[[RED]]
   store float %ret, float* %red
diff --git a/test/CodeGen/NVPTX/texsurf-queries.ll b/test/CodeGen/NVPTX/texsurf-queries.ll
new file mode 100644
index 0000000..c7637cc
--- /dev/null
+++ b/test/CodeGen/NVPTX/texsurf-queries.ll
@@ -0,0 +1,103 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20
+; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30
+
+target triple = "nvptx-unknown-cuda"
+
+@tex0 = internal addrspace(1) global i64 0, align 8
+@surf0 = internal addrspace(1) global i64 0, align 8
+
+declare i32 @llvm.nvvm.txq.width(i64)
+declare i32 @llvm.nvvm.txq.height(i64)
+declare i32 @llvm.nvvm.suq.width(i64)
+declare i32 @llvm.nvvm.suq.height(i64)
+declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)
+
+
+; SM20-LABEL: @t0
+; SM30-LABEL: @t0
+define i32 @t0(i64 %texHandle) {
+; SM20: txq.width.b32
+; SM30: txq.width.b32
+  %width = tail call i32 @llvm.nvvm.txq.width(i64 %texHandle)
+  ret i32 %width
+}
+
+; SM20-LABEL: @t1
+; SM30-LABEL: @t1
+define i32 @t1() {
+; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], tex0
+  %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0)
+; SM20: txq.width.b32 %r{{[0-9]+}}, [tex0]
+; SM30: txq.width.b32 %r{{[0-9]+}}, [%rd[[HANDLE:[0-9]+]]]
+  %width = tail call i32 @llvm.nvvm.txq.width(i64 %texHandle)
+  ret i32 %width
+}
+
+
+; SM20-LABEL: @t2
+; SM30-LABEL: @t2
+define i32 @t2(i64 %texHandle) {
+; SM20: txq.height.b32
+; SM30: txq.height.b32
+  %height = tail call i32 @llvm.nvvm.txq.height(i64 %texHandle)
+  ret i32 %height
+}
+
+; SM20-LABEL: @t3
+; SM30-LABEL: @t3
+define i32 @t3() {
+; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], tex0
+  %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0)
+; SM20: txq.height.b32 %r{{[0-9]+}}, [tex0]
+; SM30: txq.height.b32 %r{{[0-9]+}}, [%rd[[HANDLE:[0-9]+]]]
+  %height = tail call i32 @llvm.nvvm.txq.height(i64 %texHandle)
+  ret i32 %height
+}
+
+
+; SM20-LABEL: @s0
+; SM30-LABEL: @s0
+define i32 @s0(i64 %surfHandle) {
+; SM20: suq.width.b32
+; SM30: suq.width.b32
+  %width = tail call i32 @llvm.nvvm.suq.width(i64 %surfHandle)
+  ret i32 %width
+}
+
+; SM20-LABEL: @s1
+; SM30-LABEL: @s1
+define i32 @s1() {
+; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], surf0
+  %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0)
+; SM20: suq.width.b32 %r{{[0-9]+}}, [surf0]
+; SM30: suq.width.b32 %r{{[0-9]+}}, [%rd[[HANDLE:[0-9]+]]]
+  %width = tail call i32 @llvm.nvvm.suq.width(i64 %surfHandle)
+  ret i32 %width
+}
+
+
+; SM20-LABEL: @s2
+; SM30-LABEL: @s2
+define i32 @s2(i64 %surfHandle) {
+; SM20: suq.height.b32
+; SM30: suq.height.b32
+  %height = tail call i32 @llvm.nvvm.suq.height(i64 %surfHandle)
+  ret i32 %height
+}
+
+; SM20-LABEL: @s3
+; SM30-LABEL: @s3
+define i32 @s3() {
+; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], surf0
+  %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0)
+; SM20: suq.height.b32 %r{{[0-9]+}}, [surf0]
+; SM30: suq.height.b32 %r{{[0-9]+}}, [%rd[[HANDLE:[0-9]+]]]
+  %height = tail call i32 @llvm.nvvm.suq.height(i64 %surfHandle)
+  ret i32 %height
+}
+
+
+
+!nvvm.annotations = !{!1, !2}
+!1 = metadata !{i64 addrspace(1)* @tex0, metadata !"texture", i32 1}
+!2 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1}
diff --git a/test/CodeGen/NVPTX/vector-call.ll b/test/CodeGen/NVPTX/vector-call.ll
new file mode 100644
index 0000000..a03d7fd
--- /dev/null
+++ b/test/CodeGen/NVPTX/vector-call.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx-unknown-cuda"
+
+declare void @bar(<4 x i32>)
+
+; CHECK-LABEL @foo
+define void @foo(<4 x i32> %a) {
+; CHECK: st.param.v4.b32
+  tail call void @bar(<4 x i32> %a)
+  ret void
+}
diff --git a/test/CodeGen/NVPTX/vector-return.ll b/test/CodeGen/NVPTX/vector-return.ll
new file mode 100644
index 0000000..15e50f8
--- /dev/null
+++ b/test/CodeGen/NVPTX/vector-return.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+
+declare <2 x float> @bar(<2 x float> %input)
+
+define void @foo(<2 x float> %input, <2 x float>* %output) {
+; CHECK-LABEL: @foo
+entry:
+  %call = tail call <2 x float> @bar(<2 x float> %input)
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: ld.param.v2.f32 {[[ELEM1:%f[0-9]+]], [[ELEM2:%f[0-9]+]]}, [retval0+0];
+  store <2 x float> %call, <2 x float>* %output, align 8
+; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[ELEM1]], [[ELEM2]]}
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/2007-09-08-unaligned.ll b/test/CodeGen/PowerPC/2007-09-08-unaligned.ll
index 898c470..bdd91f3 100644
--- a/test/CodeGen/PowerPC/2007-09-08-unaligned.ll
+++ b/test/CodeGen/PowerPC/2007-09-08-unaligned.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s | grep stfd | count 3
-; RUN: llc < %s | grep stfs | count 1
-; RUN: llc < %s | grep lfd | count 2
-; RUN: llc < %s | grep lfs | count 2
+; RUN: llc -mattr=-vsx < %s | grep stfd | count 3
+; RUN: llc -mattr=-vsx < %s | grep stfs | count 1
+; RUN: llc -mattr=-vsx < %s | grep lfd | count 2
+; RUN: llc -mattr=-vsx < %s | grep lfs | count 2
 ; ModuleID = 'foo.c'
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
 target triple = "powerpc-apple-darwin8"
diff --git a/test/CodeGen/PowerPC/2012-10-12-bitcast.ll b/test/CodeGen/PowerPC/2012-10-12-bitcast.ll
index f841c5f..fdacef2 100644
--- a/test/CodeGen/PowerPC/2012-10-12-bitcast.ll
+++ b/test/CodeGen/PowerPC/2012-10-12-bitcast.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mattr=+altivec < %s | FileCheck %s
+; RUN: llc -mattr=-vsx -mattr=+altivec -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -mattr=+vsx -mattr=+altivec -mcpu=pwr7 < %s | FileCheck -check-prefix=CHECK-VSX %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -18,3 +19,7 @@ entry:
 ; CHECK: lwz 3, -16(1)
 ; CHECK: blr
 
+; CHECK-VSX: addi [[REGISTER:[0-9]+]], 1, -16
+; CHECK-VSX: stxvd2x 34, 0, [[REGISTER]]
+; CHECK-VSX: lwz 3, -16(1)
+; CHECK-VSX: blr
diff --git a/test/CodeGen/PowerPC/Atomics-32.ll b/test/CodeGen/PowerPC/Atomics-32.ll
deleted file mode 100644
index b7f23b1..0000000
--- a/test/CodeGen/PowerPC/Atomics-32.ll
+++ /dev/null
@@ -1,715 +0,0 @@
-; RUN: llc < %s -march=ppc32
-target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
-target triple = "powerpc-apple-darwin9"
-
-@sc = common global i8 0
-@uc = common global i8 0
-@ss = common global i16 0
-@us = common global i16 0
-@si = common global i32 0
-@ui = common global i32 0
-@sl = common global i32 0
-@ul = common global i32 0
-@sll = common global i64 0, align 8
-@ull = common global i64 0, align 8
-
-define void @test_op_ignore() nounwind {
-entry:
-  %0 = atomicrmw add i8* @sc, i8 1 monotonic
-  %1 = atomicrmw add i8* @uc, i8 1 monotonic
-  %2 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %3 = atomicrmw add i16* %2, i16 1 monotonic
-  %4 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %5 = atomicrmw add i16* %4, i16 1 monotonic
-  %6 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %7 = atomicrmw add i32* %6, i32 1 monotonic
-  %8 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %9 = atomicrmw add i32* %8, i32 1 monotonic
-  %10 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %11 = atomicrmw add i32* %10, i32 1 monotonic
-  %12 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %13 = atomicrmw add i32* %12, i32 1 monotonic
-  %14 = atomicrmw sub i8* @sc, i8 1 monotonic
-  %15 = atomicrmw sub i8* @uc, i8 1 monotonic
-  %16 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %17 = atomicrmw sub i16* %16, i16 1 monotonic
-  %18 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %19 = atomicrmw sub i16* %18, i16 1 monotonic
-  %20 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %21 = atomicrmw sub i32* %20, i32 1 monotonic
-  %22 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %23 = atomicrmw sub i32* %22, i32 1 monotonic
-  %24 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %25 = atomicrmw sub i32* %24, i32 1 monotonic
-  %26 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %27 = atomicrmw sub i32* %26, i32 1 monotonic
-  %28 = atomicrmw or i8* @sc, i8 1 monotonic
-  %29 = atomicrmw or i8* @uc, i8 1 monotonic
-  %30 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %31 = atomicrmw or i16* %30, i16 1 monotonic
-  %32 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %33 = atomicrmw or i16* %32, i16 1 monotonic
-  %34 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %35 = atomicrmw or i32* %34, i32 1 monotonic
-  %36 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %37 = atomicrmw or i32* %36, i32 1 monotonic
-  %38 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %39 = atomicrmw or i32* %38, i32 1 monotonic
-  %40 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %41 = atomicrmw or i32* %40, i32 1 monotonic
-  %42 = atomicrmw xor i8* @sc, i8 1 monotonic
-  %43 = atomicrmw xor i8* @uc, i8 1 monotonic
-  %44 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %45 = atomicrmw xor i16* %44, i16 1 monotonic
-  %46 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %47 = atomicrmw xor i16* %46, i16 1 monotonic
-  %48 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %49 = atomicrmw xor i32* %48, i32 1 monotonic
-  %50 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %51 = atomicrmw xor i32* %50, i32 1 monotonic
-  %52 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %53 = atomicrmw xor i32* %52, i32 1 monotonic
-  %54 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %55 = atomicrmw xor i32* %54, i32 1 monotonic
-  %56 = atomicrmw and i8* @sc, i8 1 monotonic
-  %57 = atomicrmw and i8* @uc, i8 1 monotonic
-  %58 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %59 = atomicrmw and i16* %58, i16 1 monotonic
-  %60 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %61 = atomicrmw and i16* %60, i16 1 monotonic
-  %62 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %63 = atomicrmw and i32* %62, i32 1 monotonic
-  %64 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %65 = atomicrmw and i32* %64, i32 1 monotonic
-  %66 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %67 = atomicrmw and i32* %66, i32 1 monotonic
-  %68 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %69 = atomicrmw and i32* %68, i32 1 monotonic
-  %70 = atomicrmw nand i8* @sc, i8 1 monotonic
-  %71 = atomicrmw nand i8* @uc, i8 1 monotonic
-  %72 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %73 = atomicrmw nand i16* %72, i16 1 monotonic
-  %74 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %75 = atomicrmw nand i16* %74, i16 1 monotonic
-  %76 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %77 = atomicrmw nand i32* %76, i32 1 monotonic
-  %78 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %79 = atomicrmw nand i32* %78, i32 1 monotonic
-  %80 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %81 = atomicrmw nand i32* %80, i32 1 monotonic
-  %82 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %83 = atomicrmw nand i32* %82, i32 1 monotonic
-  br label %return
-
-return:                                           ; preds = %entry
-  ret void
-}
-
-define void @test_fetch_and_op() nounwind {
-entry:
-  %0 = atomicrmw add i8* @sc, i8 11 monotonic
-  store i8 %0, i8* @sc, align 1
-  %1 = atomicrmw add i8* @uc, i8 11 monotonic
-  store i8 %1, i8* @uc, align 1
-  %2 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %3 = atomicrmw add i16* %2, i16 11 monotonic
-  store i16 %3, i16* @ss, align 2
-  %4 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %5 = atomicrmw add i16* %4, i16 11 monotonic
-  store i16 %5, i16* @us, align 2
-  %6 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %7 = atomicrmw add i32* %6, i32 11 monotonic
-  store i32 %7, i32* @si, align 4
-  %8 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %9 = atomicrmw add i32* %8, i32 11 monotonic
-  store i32 %9, i32* @ui, align 4
-  %10 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %11 = atomicrmw add i32* %10, i32 11 monotonic
-  store i32 %11, i32* @sl, align 4
-  %12 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %13 = atomicrmw add i32* %12, i32 11 monotonic
-  store i32 %13, i32* @ul, align 4
-  %14 = atomicrmw sub i8* @sc, i8 11 monotonic
-  store i8 %14, i8* @sc, align 1
-  %15 = atomicrmw sub i8* @uc, i8 11 monotonic
-  store i8 %15, i8* @uc, align 1
-  %16 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %17 = atomicrmw sub i16* %16, i16 11 monotonic
-  store i16 %17, i16* @ss, align 2
-  %18 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %19 = atomicrmw sub i16* %18, i16 11 monotonic
-  store i16 %19, i16* @us, align 2
-  %20 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %21 = atomicrmw sub i32* %20, i32 11 monotonic
-  store i32 %21, i32* @si, align 4
-  %22 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %23 = atomicrmw sub i32* %22, i32 11 monotonic
-  store i32 %23, i32* @ui, align 4
-  %24 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %25 = atomicrmw sub i32* %24, i32 11 monotonic
-  store i32 %25, i32* @sl, align 4
-  %26 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %27 = atomicrmw sub i32* %26, i32 11 monotonic
-  store i32 %27, i32* @ul, align 4
-  %28 = atomicrmw or i8* @sc, i8 11 monotonic
-  store i8 %28, i8* @sc, align 1
-  %29 = atomicrmw or i8* @uc, i8 11 monotonic
-  store i8 %29, i8* @uc, align 1
-  %30 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %31 = atomicrmw or i16* %30, i16 11 monotonic
-  store i16 %31, i16* @ss, align 2
-  %32 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %33 = atomicrmw or i16* %32, i16 11 monotonic
-  store i16 %33, i16* @us, align 2
-  %34 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %35 = atomicrmw or i32* %34, i32 11 monotonic
-  store i32 %35, i32* @si, align 4
-  %36 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %37 = atomicrmw or i32* %36, i32 11 monotonic
-  store i32 %37, i32* @ui, align 4
-  %38 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %39 = atomicrmw or i32* %38, i32 11 monotonic
-  store i32 %39, i32* @sl, align 4
-  %40 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %41 = atomicrmw or i32* %40, i32 11 monotonic
-  store i32 %41, i32* @ul, align 4
-  %42 = atomicrmw xor i8* @sc, i8 11 monotonic
-  store i8 %42, i8* @sc, align 1
-  %43 = atomicrmw xor i8* @uc, i8 11 monotonic
-  store i8 %43, i8* @uc, align 1
-  %44 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %45 = atomicrmw xor i16* %44, i16 11 monotonic
-  store i16 %45, i16* @ss, align 2
-  %46 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %47 = atomicrmw xor i16* %46, i16 11 monotonic
-  store i16 %47, i16* @us, align 2
-  %48 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %49 = atomicrmw xor i32* %48, i32 11 monotonic
-  store i32 %49, i32* @si, align 4
-  %50 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %51 = atomicrmw xor i32* %50, i32 11 monotonic
-  store i32 %51, i32* @ui, align 4
-  %52 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %53 = atomicrmw xor i32* %52, i32 11 monotonic
-  store i32 %53, i32* @sl, align 4
-  %54 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %55 = atomicrmw xor i32* %54, i32 11 monotonic
-  store i32 %55, i32* @ul, align 4
-  %56 = atomicrmw and i8* @sc, i8 11 monotonic
-  store i8 %56, i8* @sc, align 1
-  %57 = atomicrmw and i8* @uc, i8 11 monotonic
-  store i8 %57, i8* @uc, align 1
-  %58 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %59 = atomicrmw and i16* %58, i16 11 monotonic
-  store i16 %59, i16* @ss, align 2
-  %60 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %61 = atomicrmw and i16* %60, i16 11 monotonic
-  store i16 %61, i16* @us, align 2
-  %62 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %63 = atomicrmw and i32* %62, i32 11 monotonic
-  store i32 %63, i32* @si, align 4
-  %64 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %65 = atomicrmw and i32* %64, i32 11 monotonic
-  store i32 %65, i32* @ui, align 4
-  %66 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %67 = atomicrmw and i32* %66, i32 11 monotonic
-  store i32 %67, i32* @sl, align 4
-  %68 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %69 = atomicrmw and i32* %68, i32 11 monotonic
-  store i32 %69, i32* @ul, align 4
-  %70 = atomicrmw nand i8* @sc, i8 11 monotonic
-  store i8 %70, i8* @sc, align 1
-  %71 = atomicrmw nand i8* @uc, i8 11 monotonic
-  store i8 %71, i8* @uc, align 1
-  %72 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %73 = atomicrmw nand i16* %72, i16 11 monotonic
-  store i16 %73, i16* @ss, align 2
-  %74 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %75 = atomicrmw nand i16* %74, i16 11 monotonic
-  store i16 %75, i16* @us, align 2
-  %76 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %77 = atomicrmw nand i32* %76, i32 11 monotonic
-  store i32 %77, i32* @si, align 4
-  %78 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %79 = atomicrmw nand i32* %78, i32 11 monotonic
-  store i32 %79, i32* @ui, align 4
-  %80 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %81 = atomicrmw nand i32* %80, i32 11 monotonic
-  store i32 %81, i32* @sl, align 4
-  %82 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %83 = atomicrmw nand i32* %82, i32 11 monotonic
-  store i32 %83, i32* @ul, align 4
-  br label %return
-
-return:                                           ; preds = %entry
-  ret void
-}
-
-define void @test_op_and_fetch() nounwind {
-entry:
-  %0 = load i8* @uc, align 1
-  %1 = atomicrmw add i8* @sc, i8 %0 monotonic
-  %2 = add i8 %1, %0
-  store i8 %2, i8* @sc, align 1
-  %3 = load i8* @uc, align 1
-  %4 = atomicrmw add i8* @uc, i8 %3 monotonic
-  %5 = add i8 %4, %3
-  store i8 %5, i8* @uc, align 1
-  %6 = load i8* @uc, align 1
-  %7 = zext i8 %6 to i16
-  %8 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %9 = atomicrmw add i16* %8, i16 %7 monotonic
-  %10 = add i16 %9, %7
-  store i16 %10, i16* @ss, align 2
-  %11 = load i8* @uc, align 1
-  %12 = zext i8 %11 to i16
-  %13 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %14 = atomicrmw add i16* %13, i16 %12 monotonic
-  %15 = add i16 %14, %12
-  store i16 %15, i16* @us, align 2
-  %16 = load i8* @uc, align 1
-  %17 = zext i8 %16 to i32
-  %18 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %19 = atomicrmw add i32* %18, i32 %17 monotonic
-  %20 = add i32 %19, %17
-  store i32 %20, i32* @si, align 4
-  %21 = load i8* @uc, align 1
-  %22 = zext i8 %21 to i32
-  %23 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %24 = atomicrmw add i32* %23, i32 %22 monotonic
-  %25 = add i32 %24, %22
-  store i32 %25, i32* @ui, align 4
-  %26 = load i8* @uc, align 1
-  %27 = zext i8 %26 to i32
-  %28 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %29 = atomicrmw add i32* %28, i32 %27 monotonic
-  %30 = add i32 %29, %27
-  store i32 %30, i32* @sl, align 4
-  %31 = load i8* @uc, align 1
-  %32 = zext i8 %31 to i32
-  %33 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %34 = atomicrmw add i32* %33, i32 %32 monotonic
-  %35 = add i32 %34, %32
-  store i32 %35, i32* @ul, align 4
-  %36 = load i8* @uc, align 1
-  %37 = atomicrmw sub i8* @sc, i8 %36 monotonic
-  %38 = sub i8 %37, %36
-  store i8 %38, i8* @sc, align 1
-  %39 = load i8* @uc, align 1
-  %40 = atomicrmw sub i8* @uc, i8 %39 monotonic
-  %41 = sub i8 %40, %39
-  store i8 %41, i8* @uc, align 1
-  %42 = load i8* @uc, align 1
-  %43 = zext i8 %42 to i16
-  %44 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %45 = atomicrmw sub i16* %44, i16 %43 monotonic
-  %46 = sub i16 %45, %43
-  store i16 %46, i16* @ss, align 2
-  %47 = load i8* @uc, align 1
-  %48 = zext i8 %47 to i16
-  %49 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %50 = atomicrmw sub i16* %49, i16 %48 monotonic
-  %51 = sub i16 %50, %48
-  store i16 %51, i16* @us, align 2
-  %52 = load i8* @uc, align 1
-  %53 = zext i8 %52 to i32
-  %54 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %55 = atomicrmw sub i32* %54, i32 %53 monotonic
-  %56 = sub i32 %55, %53
-  store i32 %56, i32* @si, align 4
-  %57 = load i8* @uc, align 1
-  %58 = zext i8 %57 to i32
-  %59 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %60 = atomicrmw sub i32* %59, i32 %58 monotonic
-  %61 = sub i32 %60, %58
-  store i32 %61, i32* @ui, align 4
-  %62 = load i8* @uc, align 1
-  %63 = zext i8 %62 to i32
-  %64 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %65 = atomicrmw sub i32* %64, i32 %63 monotonic
-  %66 = sub i32 %65, %63
-  store i32 %66, i32* @sl, align 4
-  %67 = load i8* @uc, align 1
-  %68 = zext i8 %67 to i32
-  %69 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %70 = atomicrmw sub i32* %69, i32 %68 monotonic
-  %71 = sub i32 %70, %68
-  store i32 %71, i32* @ul, align 4
-  %72 = load i8* @uc, align 1
-  %73 = atomicrmw or i8* @sc, i8 %72 monotonic
-  %74 = or i8 %73, %72
-  store i8 %74, i8* @sc, align 1
-  %75 = load i8* @uc, align 1
-  %76 = atomicrmw or i8* @uc, i8 %75 monotonic
-  %77 = or i8 %76, %75
-  store i8 %77, i8* @uc, align 1
-  %78 = load i8* @uc, align 1
-  %79 = zext i8 %78 to i16
-  %80 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %81 = atomicrmw or i16* %80, i16 %79 monotonic
-  %82 = or i16 %81, %79
-  store i16 %82, i16* @ss, align 2
-  %83 = load i8* @uc, align 1
-  %84 = zext i8 %83 to i16
-  %85 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %86 = atomicrmw or i16* %85, i16 %84 monotonic
-  %87 = or i16 %86, %84
-  store i16 %87, i16* @us, align 2
-  %88 = load i8* @uc, align 1
-  %89 = zext i8 %88 to i32
-  %90 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %91 = atomicrmw or i32* %90, i32 %89 monotonic
-  %92 = or i32 %91, %89
-  store i32 %92, i32* @si, align 4
-  %93 = load i8* @uc, align 1
-  %94 = zext i8 %93 to i32
-  %95 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %96 = atomicrmw or i32* %95, i32 %94 monotonic
-  %97 = or i32 %96, %94
-  store i32 %97, i32* @ui, align 4
-  %98 = load i8* @uc, align 1
-  %99 = zext i8 %98 to i32
-  %100 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %101 = atomicrmw or i32* %100, i32 %99 monotonic
-  %102 = or i32 %101, %99
-  store i32 %102, i32* @sl, align 4
-  %103 = load i8* @uc, align 1
-  %104 = zext i8 %103 to i32
-  %105 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %106 = atomicrmw or i32* %105, i32 %104 monotonic
-  %107 = or i32 %106, %104
-  store i32 %107, i32* @ul, align 4
-  %108 = load i8* @uc, align 1
-  %109 = atomicrmw xor i8* @sc, i8 %108 monotonic
-  %110 = xor i8 %109, %108
-  store i8 %110, i8* @sc, align 1
-  %111 = load i8* @uc, align 1
-  %112 = atomicrmw xor i8* @uc, i8 %111 monotonic
-  %113 = xor i8 %112, %111
-  store i8 %113, i8* @uc, align 1
-  %114 = load i8* @uc, align 1
-  %115 = zext i8 %114 to i16
-  %116 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %117 = atomicrmw xor i16* %116, i16 %115 monotonic
-  %118 = xor i16 %117, %115
-  store i16 %118, i16* @ss, align 2
-  %119 = load i8* @uc, align 1
-  %120 = zext i8 %119 to i16
-  %121 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %122 = atomicrmw xor i16* %121, i16 %120 monotonic
-  %123 = xor i16 %122, %120
-  store i16 %123, i16* @us, align 2
-  %124 = load i8* @uc, align 1
-  %125 = zext i8 %124 to i32
-  %126 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %127 = atomicrmw xor i32* %126, i32 %125 monotonic
-  %128 = xor i32 %127, %125
-  store i32 %128, i32* @si, align 4
-  %129 = load i8* @uc, align 1
-  %130 = zext i8 %129 to i32
-  %131 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %132 = atomicrmw xor i32* %131, i32 %130 monotonic
-  %133 = xor i32 %132, %130
-  store i32 %133, i32* @ui, align 4
-  %134 = load i8* @uc, align 1
-  %135 = zext i8 %134 to i32
-  %136 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %137 = atomicrmw xor i32* %136, i32 %135 monotonic
-  %138 = xor i32 %137, %135
-  store i32 %138, i32* @sl, align 4
-  %139 = load i8* @uc, align 1
-  %140 = zext i8 %139 to i32
-  %141 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %142 = atomicrmw xor i32* %141, i32 %140 monotonic
-  %143 = xor i32 %142, %140
-  store i32 %143, i32* @ul, align 4
-  %144 = load i8* @uc, align 1
-  %145 = atomicrmw and i8* @sc, i8 %144 monotonic
-  %146 = and i8 %145, %144
-  store i8 %146, i8* @sc, align 1
-  %147 = load i8* @uc, align 1
-  %148 = atomicrmw and i8* @uc, i8 %147 monotonic
-  %149 = and i8 %148, %147
-  store i8 %149, i8* @uc, align 1
-  %150 = load i8* @uc, align 1
-  %151 = zext i8 %150 to i16
-  %152 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %153 = atomicrmw and i16* %152, i16 %151 monotonic
-  %154 = and i16 %153, %151
-  store i16 %154, i16* @ss, align 2
-  %155 = load i8* @uc, align 1
-  %156 = zext i8 %155 to i16
-  %157 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %158 = atomicrmw and i16* %157, i16 %156 monotonic
-  %159 = and i16 %158, %156
-  store i16 %159, i16* @us, align 2
-  %160 = load i8* @uc, align 1
-  %161 = zext i8 %160 to i32
-  %162 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %163 = atomicrmw and i32* %162, i32 %161 monotonic
-  %164 = and i32 %163, %161
-  store i32 %164, i32* @si, align 4
-  %165 = load i8* @uc, align 1
-  %166 = zext i8 %165 to i32
-  %167 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %168 = atomicrmw and i32* %167, i32 %166 monotonic
-  %169 = and i32 %168, %166
-  store i32 %169, i32* @ui, align 4
-  %170 = load i8* @uc, align 1
-  %171 = zext i8 %170 to i32
-  %172 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %173 = atomicrmw and i32* %172, i32 %171 monotonic
-  %174 = and i32 %173, %171
-  store i32 %174, i32* @sl, align 4
-  %175 = load i8* @uc, align 1
-  %176 = zext i8 %175 to i32
-  %177 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %178 = atomicrmw and i32* %177, i32 %176 monotonic
-  %179 = and i32 %178, %176
-  store i32 %179, i32* @ul, align 4
-  %180 = load i8* @uc, align 1
-  %181 = atomicrmw nand i8* @sc, i8 %180 monotonic
-  %182 = xor i8 %181, -1
-  %183 = and i8 %182, %180
-  store i8 %183, i8* @sc, align 1
-  %184 = load i8* @uc, align 1
-  %185 = atomicrmw nand i8* @uc, i8 %184 monotonic
-  %186 = xor i8 %185, -1
-  %187 = and i8 %186, %184
-  store i8 %187, i8* @uc, align 1
-  %188 = load i8* @uc, align 1
-  %189 = zext i8 %188 to i16
-  %190 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %191 = atomicrmw nand i16* %190, i16 %189 monotonic
-  %192 = xor i16 %191, -1
-  %193 = and i16 %192, %189
-  store i16 %193, i16* @ss, align 2
-  %194 = load i8* @uc, align 1
-  %195 = zext i8 %194 to i16
-  %196 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %197 = atomicrmw nand i16* %196, i16 %195 monotonic
-  %198 = xor i16 %197, -1
-  %199 = and i16 %198, %195
-  store i16 %199, i16* @us, align 2
-  %200 = load i8* @uc, align 1
-  %201 = zext i8 %200 to i32
-  %202 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %203 = atomicrmw nand i32* %202, i32 %201 monotonic
-  %204 = xor i32 %203, -1
-  %205 = and i32 %204, %201
-  store i32 %205, i32* @si, align 4
-  %206 = load i8* @uc, align 1
-  %207 = zext i8 %206 to i32
-  %208 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %209 = atomicrmw nand i32* %208, i32 %207 monotonic
-  %210 = xor i32 %209, -1
-  %211 = and i32 %210, %207
-  store i32 %211, i32* @ui, align 4
-  %212 = load i8* @uc, align 1
-  %213 = zext i8 %212 to i32
-  %214 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %215 = atomicrmw nand i32* %214, i32 %213 monotonic
-  %216 = xor i32 %215, -1
-  %217 = and i32 %216, %213
-  store i32 %217, i32* @sl, align 4
-  %218 = load i8* @uc, align 1
-  %219 = zext i8 %218 to i32
-  %220 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %221 = atomicrmw nand i32* %220, i32 %219 monotonic
-  %222 = xor i32 %221, -1
-  %223 = and i32 %222, %219
-  store i32 %223, i32* @ul, align 4
-  br label %return
-
-return:                                           ; preds = %entry
-  ret void
-}
-
-define void @test_compare_and_swap() nounwind {
-entry:
-  %0 = load i8* @uc, align 1
-  %1 = load i8* @sc, align 1
-  %pair2 = cmpxchg i8* @sc, i8 %0, i8 %1 monotonic monotonic
-  %2 = extractvalue { i8, i1 } %pair2, 0
-  store i8 %2, i8* @sc, align 1
-  %3 = load i8* @uc, align 1
-  %4 = load i8* @sc, align 1
-  %pair5 = cmpxchg i8* @uc, i8 %3, i8 %4 monotonic monotonic
-  %5 = extractvalue { i8, i1 } %pair5, 0
-  store i8 %5, i8* @uc, align 1
-  %6 = load i8* @uc, align 1
-  %7 = zext i8 %6 to i16
-  %8 = load i8* @sc, align 1
-  %9 = sext i8 %8 to i16
-  %10 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %pair11 = cmpxchg i16* %10, i16 %7, i16 %9 monotonic monotonic
-  %11 = extractvalue { i16, i1 } %pair11, 0
-  store i16 %11, i16* @ss, align 2
-  %12 = load i8* @uc, align 1
-  %13 = zext i8 %12 to i16
-  %14 = load i8* @sc, align 1
-  %15 = sext i8 %14 to i16
-  %16 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %pair17 = cmpxchg i16* %16, i16 %13, i16 %15 monotonic monotonic
-  %17 = extractvalue { i16, i1 } %pair17, 0
-  store i16 %17, i16* @us, align 2
-  %18 = load i8* @uc, align 1
-  %19 = zext i8 %18 to i32
-  %20 = load i8* @sc, align 1
-  %21 = sext i8 %20 to i32
-  %22 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %pair23 = cmpxchg i32* %22, i32 %19, i32 %21 monotonic monotonic
-  %23 = extractvalue { i32, i1 } %pair23, 0
-  store i32 %23, i32* @si, align 4
-  %24 = load i8* @uc, align 1
-  %25 = zext i8 %24 to i32
-  %26 = load i8* @sc, align 1
-  %27 = sext i8 %26 to i32
-  %28 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %pair29 = cmpxchg i32* %28, i32 %25, i32 %27 monotonic monotonic
-  %29 = extractvalue { i32, i1 } %pair29, 0
-  store i32 %29, i32* @ui, align 4
-  %30 = load i8* @uc, align 1
-  %31 = zext i8 %30 to i32
-  %32 = load i8* @sc, align 1
-  %33 = sext i8 %32 to i32
-  %34 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %pair35 = cmpxchg i32* %34, i32 %31, i32 %33 monotonic monotonic
-  %35 = extractvalue { i32, i1 } %pair35, 0
-  store i32 %35, i32* @sl, align 4
-  %36 = load i8* @uc, align 1
-  %37 = zext i8 %36 to i32
-  %38 = load i8* @sc, align 1
-  %39 = sext i8 %38 to i32
-  %40 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %pair41 = cmpxchg i32* %40, i32 %37, i32 %39 monotonic monotonic
-  %41 = extractvalue { i32, i1 } %pair41, 0
-  store i32 %41, i32* @ul, align 4
-  %42 = load i8* @uc, align 1
-  %43 = load i8* @sc, align 1
-  %pair44 = cmpxchg i8* @sc, i8 %42, i8 %43 monotonic monotonic
-  %44 = extractvalue { i8, i1 } %pair44, 0
-  %45 = icmp eq i8 %44, %42
-  %46 = zext i1 %45 to i32
-  store i32 %46, i32* @ui, align 4
-  %47 = load i8* @uc, align 1
-  %48 = load i8* @sc, align 1
-  %pair49 = cmpxchg i8* @uc, i8 %47, i8 %48 monotonic monotonic
-  %49 = extractvalue { i8, i1 } %pair49, 0
-  %50 = icmp eq i8 %49, %47
-  %51 = zext i1 %50 to i32
-  store i32 %51, i32* @ui, align 4
-  %52 = load i8* @uc, align 1
-  %53 = zext i8 %52 to i16
-  %54 = load i8* @sc, align 1
-  %55 = sext i8 %54 to i16
-  %56 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %pair57 = cmpxchg i16* %56, i16 %53, i16 %55 monotonic monotonic
-  %57 = extractvalue { i16, i1 } %pair57, 0
-  %58 = icmp eq i16 %57, %53
-  %59 = zext i1 %58 to i32
-  store i32 %59, i32* @ui, align 4
-  %60 = load i8* @uc, align 1
-  %61 = zext i8 %60 to i16
-  %62 = load i8* @sc, align 1
-  %63 = sext i8 %62 to i16
-  %64 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %pair65 = cmpxchg i16* %64, i16 %61, i16 %63 monotonic monotonic
-  %65 = extractvalue { i16, i1 } %pair65, 0
-  %66 = icmp eq i16 %65, %61
-  %67 = zext i1 %66 to i32
-  store i32 %67, i32* @ui, align 4
-  %68 = load i8* @uc, align 1
-  %69 = zext i8 %68 to i32
-  %70 = load i8* @sc, align 1
-  %71 = sext i8 %70 to i32
-  %72 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %pair73 = cmpxchg i32* %72, i32 %69, i32 %71 monotonic monotonic
-  %73 = extractvalue { i32, i1 } %pair73, 0
-  %74 = icmp eq i32 %73, %69
-  %75 = zext i1 %74 to i32
-  store i32 %75, i32* @ui, align 4
-  %76 = load i8* @uc, align 1
-  %77 = zext i8 %76 to i32
-  %78 = load i8* @sc, align 1
-  %79 = sext i8 %78 to i32
-  %80 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %pair81 = cmpxchg i32* %80, i32 %77, i32 %79 monotonic monotonic
-  %81 = extractvalue { i32, i1 } %pair81, 0
-  %82 = icmp eq i32 %81, %77
-  %83 = zext i1 %82 to i32
-  store i32 %83, i32* @ui, align 4
-  %84 = load i8* @uc, align 1
-  %85 = zext i8 %84 to i32
-  %86 = load i8* @sc, align 1
-  %87 = sext i8 %86 to i32
-  %88 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %pair89 = cmpxchg i32* %88, i32 %85, i32 %87 monotonic monotonic
-  %89 = extractvalue { i32, i1 } %pair89, 0
-  %90 = icmp eq i32 %89, %85
-  %91 = zext i1 %90 to i32
-  store i32 %91, i32* @ui, align 4
-  %92 = load i8* @uc, align 1
-  %93 = zext i8 %92 to i32
-  %94 = load i8* @sc, align 1
-  %95 = sext i8 %94 to i32
-  %96 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %pair97 = cmpxchg i32* %96, i32 %93, i32 %95 monotonic monotonic
-  %97 = extractvalue { i32, i1 } %pair97, 0
-  %98 = icmp eq i32 %97, %93
-  %99 = zext i1 %98 to i32
-  store i32 %99, i32* @ui, align 4
-  br label %return
-
-return:                                           ; preds = %entry
-  ret void
-}
-
-define void @test_lock() nounwind {
-entry:
-  %0 = atomicrmw xchg i8* @sc, i8 1 monotonic
-  store i8 %0, i8* @sc, align 1
-  %1 = atomicrmw xchg i8* @uc, i8 1 monotonic
-  store i8 %1, i8* @uc, align 1
-  %2 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %3 = atomicrmw xchg i16* %2, i16 1 monotonic
-  store i16 %3, i16* @ss, align 2
-  %4 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %5 = atomicrmw xchg i16* %4, i16 1 monotonic
-  store i16 %5, i16* @us, align 2
-  %6 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %7 = atomicrmw xchg i32* %6, i32 1 monotonic
-  store i32 %7, i32* @si, align 4
-  %8 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %9 = atomicrmw xchg i32* %8, i32 1 monotonic
-  store i32 %9, i32* @ui, align 4
-  %10 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %11 = atomicrmw xchg i32* %10, i32 1 monotonic
-  store i32 %11, i32* @sl, align 4
-  %12 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %13 = atomicrmw xchg i32* %12, i32 1 monotonic
-  store i32 %13, i32* @ul, align 4
-  fence seq_cst
-  store volatile i8 0, i8* @sc, align 1
-  store volatile i8 0, i8* @uc, align 1
-  %14 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  store volatile i16 0, i16* %14, align 2
-  %15 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  store volatile i16 0, i16* %15, align 2
-  %16 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  store volatile i32 0, i32* %16, align 4
-  %17 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  store volatile i32 0, i32* %17, align 4
-  %18 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  store volatile i32 0, i32* %18, align 4
-  %19 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  store volatile i32 0, i32* %19, align 4
-  %20 = bitcast i8* bitcast (i64* @sll to i8*) to i64*
-  store volatile i64 0, i64* %20, align 8
-  %21 = bitcast i8* bitcast (i64* @ull to i8*) to i64*
-  store volatile i64 0, i64* %21, align 8
-  br label %return
-
-return:                                           ; preds = %entry
-  ret void
-}
diff --git a/test/CodeGen/PowerPC/anon_aggr.ll b/test/CodeGen/PowerPC/anon_aggr.ll
index 3bae5c6..6c4f140 100644
--- a/test/CodeGen/PowerPC/anon_aggr.ll
+++ b/test/CodeGen/PowerPC/anon_aggr.ll
@@ -62,8 +62,7 @@ unequal:
 }
 
 ; CHECK-LABEL: func2:
-; CHECK: addi [[REG1:[0-9]+]], 1, 64
-; CHECK: ld [[REG2:[0-9]+]], 8([[REG1]])
+; CHECK: ld [[REG2:[0-9]+]], 72(1)
 ; CHECK: cmpld {{[0-9]+}}, 4, [[REG2]]
 ; CHECK-DAG: std [[REG2]], -[[OFFSET1:[0-9]+]]
 ; CHECK-DAG: std 4, -[[OFFSET2:[0-9]+]]
@@ -82,8 +81,7 @@ unequal:
 ; DARWIN32: lwz r3, -[[OFFSET2]]
 
 ; DARWIN64: _func2:
-; DARWIN64: addi r[[REG1:[0-9]+]], r1, 64
-; DARWIN64: ld r[[REG2:[0-9]+]], 8(r[[REG1]])
+; DARWIN64: ld r[[REG2:[0-9]+]], 72(r1)
 ; DARWIN64: mr
 ; DARWIN64: mr r[[REG3:[0-9]+]], r[[REGA:[0-9]+]]
 ; DARWIN64: cmpld cr{{[0-9]+}}, r[[REGA]], r[[REG2]]
@@ -108,10 +106,8 @@ unequal:
 }
 
 ; CHECK-LABEL: func3:
-; CHECK: addi [[REG1:[0-9]+]], 1, 64
-; CHECK: addi [[REG2:[0-9]+]], 1, 48
-; CHECK: ld [[REG3:[0-9]+]], 8([[REG1]])
-; CHECK: ld [[REG4:[0-9]+]], 8([[REG2]])
+; CHECK: ld [[REG3:[0-9]+]], 72(1)
+; CHECK: ld [[REG4:[0-9]+]], 56(1)
 ; CHECK: cmpld {{[0-9]+}}, [[REG4]], [[REG3]]
 ; CHECK: std [[REG3]], -[[OFFSET1:[0-9]+]](1)
 ; CHECK: std [[REG4]], -[[OFFSET2:[0-9]+]](1)
@@ -130,10 +126,8 @@ unequal:
 ; DARWIN32: lwz r3, -[[OFFSET1]]
 
 ; DARWIN64: _func3:
-; DARWIN64: addi r[[REG1:[0-9]+]], r1, 64
-; DARWIN64: addi r[[REG2:[0-9]+]], r1, 48
-; DARWIN64: ld r[[REG3:[0-9]+]], 8(r[[REG1]])
-; DARWIN64: ld r[[REG4:[0-9]+]], 8(r[[REG2]])
+; DARWIN64: ld r[[REG3:[0-9]+]], 72(r1)
+; DARWIN64: ld r[[REG4:[0-9]+]], 56(r1)
 ; DARWIN64: cmpld cr{{[0-9]+}}, r[[REG4]], r[[REG3]]
 ; DARWIN64: std r[[REG3]], -[[OFFSET1:[0-9]+]]
 ; DARWIN64: std r[[REG4]], -[[OFFSET2:[0-9]+]]
@@ -157,12 +151,11 @@ unequal:
 }
 
 ; CHECK-LABEL: func4:
-; CHECK: addi [[REG1:[0-9]+]], 1, 128
+; CHECK: ld [[REG3:[0-9]+]], 136(1)
 ; CHECK: ld [[REG2:[0-9]+]], 120(1)
-; CHECK: ld [[REG3:[0-9]+]], 8([[REG1]])
 ; CHECK: cmpld {{[0-9]+}}, [[REG2]], [[REG3]]
-; CHECK: std [[REG2]], -[[OFFSET1:[0-9]+]](1)
 ; CHECK: std [[REG3]], -[[OFFSET2:[0-9]+]](1)
+; CHECK: std [[REG2]], -[[OFFSET1:[0-9]+]](1)
 ; CHECK: ld 3, -[[OFFSET1]](1)
 ; CHECK: ld 3, -[[OFFSET2]](1)
 
@@ -178,9 +171,8 @@ unequal:
 ; DARWIN32: lwz r[[REG1]], -[[OFFSET2]]
 
 ; DARWIN64: _func4:
-; DARWIN64: addi r[[REG1:[0-9]+]], r1, 128
 ; DARWIN64: ld r[[REG2:[0-9]+]], 120(r1)
-; DARWIN64: ld r[[REG3:[0-9]+]], 8(r[[REG1]])
+; DARWIN64: ld r[[REG3:[0-9]+]], 136(r1)
 ; DARWIN64: mr r[[REG4:[0-9]+]], r[[REG2]]
 ; DARWIN64: cmpld cr{{[0-9]+}}, r[[REG2]], r[[REG3]]
 ; DARWIN64: std r[[REG4]], -[[OFFSET1:[0-9]+]]
diff --git a/test/CodeGen/PowerPC/asm-constraints.ll b/test/CodeGen/PowerPC/asm-constraints.ll
new file mode 100644
index 0000000..998b618
--- /dev/null
+++ b/test/CodeGen/PowerPC/asm-constraints.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -mcpu=pwr8 | FileCheck %s
+
+; Generated from following C code:
+;
+; void foo (int result, char *addr) {
+;   __asm__ __volatile__ (
+;     "ld%U1%X1 %0,%1\n"
+;     "cmpw %0,%0\n"
+;     "bne- 1f\n"
+;     "1: isync\n"
+;     : "=r" (result)
+;     : "m"(*addr) : "memory", "cr0");
+; }
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+; Check that we accept 'U' and 'X' constraints.
+define void @foo(i32 signext %result, i8* %addr) #0 {
+entry:
+  %result.addr = alloca i32, align 4
+  %addr.addr = alloca i8*, align 8
+  store i32 %result, i32* %result.addr, align 4
+  store i8* %addr, i8** %addr.addr, align 8
+  %0 = load i8** %addr.addr, align 8
+  %1 = call i32 asm sideeffect "ld${1:U}${1:X} $0,$1\0Acmpw $0,$0\0Abne- 1f\0A1: isync\0A", "=r,*m,~{memory},~{cr0}"(i8* %0) #1, !srcloc !1
+  store i32 %1, i32* %result.addr, align 4
+  ret void
+}
+
+; CHECK-LABEL: @foo
+; CHECK: ld [[REG:[0-9]+]],0(4)
+; CHECK-NEXT: cmpw [[REG]],[[REG]]
+; CHECK-NEXT: bne- 1f
+; CHECK-NEXT: 1: isync
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.6.0 (trunk 217557)"}
+!1 = metadata !{i32 67, i32 91, i32 110, i32 126}
diff --git a/test/CodeGen/PowerPC/atomic-2.ll b/test/CodeGen/PowerPC/atomic-2.ll
index 843250f..9cb0fa5 100644
--- a/test/CodeGen/PowerPC/atomic-2.ll
+++ b/test/CodeGen/PowerPC/atomic-2.ll
@@ -30,8 +30,9 @@ define void @atomic_store(i64* %mem, i64 %val) nounwind {
 entry:
 ; CHECK: @atomic_store
   store atomic i64 %val, i64* %mem release, align 64
-; CHECK: ldarx
-; CHECK: stdcx.
+; CHECK: sync 1
+; CHECK-NOT: stdcx
+; CHECK: std
   ret void
 }
 
@@ -39,9 +40,9 @@ define i64 @atomic_load(i64* %mem) nounwind {
 entry:
 ; CHECK: @atomic_load
   %tmp = load atomic i64* %mem acquire, align 64
-; CHECK: ldarx
-; CHECK: stdcx.
-; CHECK: stdcx.
+; CHECK-NOT: ldarx
+; CHECK: ld
+; CHECK: sync 1
   ret i64 %tmp
 }
 
diff --git a/test/CodeGen/PowerPC/atomics-fences.ll b/test/CodeGen/PowerPC/atomics-fences.ll
new file mode 100644
index 0000000..862bd17
--- /dev/null
+++ b/test/CodeGen/PowerPC/atomics-fences.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc32 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc64 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -mcpu=440 | FileCheck %s --check-prefix=PPC440
+
+; Fences
+define void @fence_acquire() {
+; CHECK-LABEL: fence_acquire
+; CHECK: sync 1
+; PPC440-NOT: sync 1
+; PPC440: msync
+  fence acquire
+  ret void
+}
+define void @fence_release() {
+; CHECK-LABEL: fence_release
+; CHECK: sync 1
+; PPC440-NOT: sync 1
+; PPC440: msync
+  fence release
+  ret void
+}
+define void @fence_seq_cst() {
+; CHECK-LABEL: fence_seq_cst
+; CHECK: sync 0
+; PPC440-NOT: sync 0
+; PPC440: msync
+  fence seq_cst
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/atomics-indexed.ll b/test/CodeGen/PowerPC/atomics-indexed.ll
new file mode 100644
index 0000000..bb9ca04
--- /dev/null
+++ b/test/CodeGen/PowerPC/atomics-indexed.ll
@@ -0,0 +1,81 @@
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc32 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=PPC32
+; FIXME: -verify-machineinstrs currently fail on ppc64 (mismatched register/instruction).
+; This is already checked for in Atomics-64.ll
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc64 | FileCheck %s --check-prefix=CHECK --check-prefix=PPC64
+
+; In this file, we check that atomic load/store can make use of the indexed
+; versions of the instructions.
+
+; Indexed version of loads
+define i8 @load_x_i8_seq_cst([100000 x i8]* %mem) {
+; CHECK-LABEL: load_x_i8_seq_cst
+; CHECK: sync 0
+; CHECK: lbzx
+; CHECK: sync 1
+  %ptr = getelementptr inbounds [100000 x i8]* %mem, i64 0, i64 90000
+  %val = load atomic i8* %ptr seq_cst, align 1
+  ret i8 %val
+}
+define i16 @load_x_i16_acquire([100000 x i16]* %mem) {
+; CHECK-LABEL: load_x_i16_acquire
+; CHECK: lhzx
+; CHECK: sync 1
+  %ptr = getelementptr inbounds [100000 x i16]* %mem, i64 0, i64 90000
+  %val = load atomic i16* %ptr acquire, align 2
+  ret i16 %val
+}
+define i32 @load_x_i32_monotonic([100000 x i32]* %mem) {
+; CHECK-LABEL: load_x_i32_monotonic
+; CHECK: lwzx
+; CHECK-NOT: sync
+  %ptr = getelementptr inbounds [100000 x i32]* %mem, i64 0, i64 90000
+  %val = load atomic i32* %ptr monotonic, align 4
+  ret i32 %val
+}
+define i64 @load_x_i64_unordered([100000 x i64]* %mem) {
+; CHECK-LABEL: load_x_i64_unordered
+; PPC32: __sync_
+; PPC64-NOT: __sync_
+; PPC64: ldx
+; CHECK-NOT: sync
+  %ptr = getelementptr inbounds [100000 x i64]* %mem, i64 0, i64 90000
+  %val = load atomic i64* %ptr unordered, align 8
+  ret i64 %val
+}
+
+; Indexed version of stores
+define void @store_x_i8_seq_cst([100000 x i8]* %mem) {
+; CHECK-LABEL: store_x_i8_seq_cst
+; CHECK: sync 0
+; CHECK: stbx
+  %ptr = getelementptr inbounds [100000 x i8]* %mem, i64 0, i64 90000
+  store atomic i8 42, i8* %ptr seq_cst, align 1
+  ret void
+}
+define void @store_x_i16_release([100000 x i16]* %mem) {
+; CHECK-LABEL: store_x_i16_release
+; CHECK: sync 1
+; CHECK: sthx
+  %ptr = getelementptr inbounds [100000 x i16]* %mem, i64 0, i64 90000
+  store atomic i16 42, i16* %ptr release, align 2
+  ret void
+}
+define void @store_x_i32_monotonic([100000 x i32]* %mem) {
+; CHECK-LABEL: store_x_i32_monotonic
+; CHECK-NOT: sync
+; CHECK: stwx
+  %ptr = getelementptr inbounds [100000 x i32]* %mem, i64 0, i64 90000
+  store atomic i32 42, i32* %ptr monotonic, align 4
+  ret void
+}
+define void @store_x_i64_unordered([100000 x i64]* %mem) {
+; CHECK-LABEL: store_x_i64_unordered
+; CHECK-NOT: sync 0
+; CHECK-NOT: sync 1
+; PPC32: __sync_
+; PPC64-NOT: __sync_
+; PPC64: stdx
+  %ptr = getelementptr inbounds [100000 x i64]* %mem, i64 0, i64 90000
+  store atomic i64 42, i64* %ptr unordered, align 8
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/atomics.ll b/test/CodeGen/PowerPC/atomics.ll
new file mode 100644
index 0000000..5f6a6a4
--- /dev/null
+++ b/test/CodeGen/PowerPC/atomics.ll
@@ -0,0 +1,137 @@
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc32 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=PPC32
+; FIXME: -verify-machineinstrs currently fail on ppc64 (mismatched register/instruction).
+; This is already checked for in Atomics-64.ll
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc64 | FileCheck %s --check-prefix=CHECK --check-prefix=PPC64
+
+; FIXME: we don't currently check for the operations themselves with CHECK-NEXT,
+;   because they are implemented in a very messy way with lwarx/stwcx.
+;   It should be fixed soon in another patch.
+
+; We first check loads, for all sizes from i8 to i64.
+; We also vary orderings to check for barriers.
+define i8 @load_i8_unordered(i8* %mem) {
+; CHECK-LABEL: load_i8_unordered
+; CHECK: lbz
+; CHECK-NOT: sync
+  %val = load atomic i8* %mem unordered, align 1
+  ret i8 %val
+}
+define i16 @load_i16_monotonic(i16* %mem) {
+; CHECK-LABEL: load_i16_monotonic
+; CHECK: lhz
+; CHECK-NOT: sync
+  %val = load atomic i16* %mem monotonic, align 2
+  ret i16 %val
+}
+define i32 @load_i32_acquire(i32* %mem) {
+; CHECK-LABEL: load_i32_acquire
+; CHECK: lwz
+  %val = load atomic i32* %mem acquire, align 4
+; CHECK: sync 1
+  ret i32 %val
+}
+define i64 @load_i64_seq_cst(i64* %mem) {
+; CHECK-LABEL: load_i64_seq_cst
+; CHECK: sync 0
+; PPC32: __sync_
+; PPC64-NOT: __sync_
+; PPC64: ld
+  %val = load atomic i64* %mem seq_cst, align 8
+; CHECK: sync 1
+  ret i64 %val
+}
+
+; Stores
+define void @store_i8_unordered(i8* %mem) {
+; CHECK-LABEL: store_i8_unordered
+; CHECK-NOT: sync
+; CHECK: stb
+  store atomic i8 42, i8* %mem unordered, align 1
+  ret void
+}
+define void @store_i16_monotonic(i16* %mem) {
+; CHECK-LABEL: store_i16_monotonic
+; CHECK-NOT: sync
+; CHECK: sth
+  store atomic i16 42, i16* %mem monotonic, align 2
+  ret void
+}
+define void @store_i32_release(i32* %mem) {
+; CHECK-LABEL: store_i32_release
+; CHECK: sync 1
+; CHECK: stw
+  store atomic i32 42, i32* %mem release, align 4
+  ret void
+}
+define void @store_i64_seq_cst(i64* %mem) {
+; CHECK-LABEL: store_i64_seq_cst
+; CHECK: sync 0
+; PPC32: __sync_
+; PPC64-NOT: __sync_
+; PPC64: std
+  store atomic i64 42, i64* %mem seq_cst, align 8
+  ret void
+}
+
+; Atomic CmpXchg
+define i8 @cas_strong_i8_sc_sc(i8* %mem) {
+; CHECK-LABEL: cas_strong_i8_sc_sc
+; CHECK: sync 0
+  %val = cmpxchg i8* %mem, i8 0, i8 1 seq_cst seq_cst
+; CHECK: sync 1
+  %loaded = extractvalue { i8, i1} %val, 0
+  ret i8 %loaded
+}
+define i16 @cas_weak_i16_acquire_acquire(i16* %mem) {
+; CHECK-LABEL: cas_weak_i16_acquire_acquire
+;CHECK-NOT: sync
+  %val = cmpxchg weak i16* %mem, i16 0, i16 1 acquire acquire
+; CHECK: sync 1
+  %loaded = extractvalue { i16, i1} %val, 0
+  ret i16 %loaded
+}
+define i32 @cas_strong_i32_acqrel_acquire(i32* %mem) {
+; CHECK-LABEL: cas_strong_i32_acqrel_acquire
+; CHECK: sync 1
+  %val = cmpxchg i32* %mem, i32 0, i32 1 acq_rel acquire
+; CHECK: sync 1
+  %loaded = extractvalue { i32, i1} %val, 0
+  ret i32 %loaded
+}
+define i64 @cas_weak_i64_release_monotonic(i64* %mem) {
+; CHECK-LABEL: cas_weak_i64_release_monotonic
+; CHECK: sync 1
+  %val = cmpxchg weak i64* %mem, i64 0, i64 1 release monotonic
+; CHECK-NOT: [sync ]
+  %loaded = extractvalue { i64, i1} %val, 0
+  ret i64 %loaded
+}
+
+; AtomicRMW
+define i8 @add_i8_monotonic(i8* %mem, i8 %operand) {
+; CHECK-LABEL: add_i8_monotonic
+; CHECK-NOT: sync
+  %val = atomicrmw add i8* %mem, i8 %operand monotonic
+  ret i8 %val
+}
+define i16 @xor_i16_seq_cst(i16* %mem, i16 %operand) {
+; CHECK-LABEL: xor_i16_seq_cst
+; CHECK: sync 0
+  %val = atomicrmw xor i16* %mem, i16 %operand seq_cst
+; CHECK: sync 1
+  ret i16 %val
+}
+define i32 @xchg_i32_acq_rel(i32* %mem, i32 %operand) {
+; CHECK-LABEL: xchg_i32_acq_rel
+; CHECK: sync 1
+  %val = atomicrmw xchg i32* %mem, i32 %operand acq_rel
+; CHECK: sync 1
+  ret i32 %val
+}
+define i64 @and_i64_release(i64* %mem, i64 %operand) {
+; CHECK-LABEL: and_i64_release
+; CHECK: sync 1
+  %val = atomicrmw and i64* %mem, i64 %operand release
+; CHECK-NOT: [sync ]
+  ret i64 %val
+}
diff --git a/test/CodeGen/PowerPC/available-externally.ll b/test/CodeGen/PowerPC/available-externally.ll
index abed0de..53c4359 100644
--- a/test/CodeGen/PowerPC/available-externally.ll
+++ b/test/CodeGen/PowerPC/available-externally.ll
@@ -1,7 +1,8 @@
 ; RUN: llc < %s -relocation-model=static | FileCheck %s -check-prefix=STATIC
-; RUN: llc < %s -relocation-model=pic | FileCheck %s -check-prefix=PIC
+; RUN: llc < %s -relocation-model=pic -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=PIC
+; RUN: llc < %s -relocation-model=pic -mtriple=powerpc-unknown-linux | FileCheck %s -check-prefix=PICELF
 ; RUN: llc < %s -relocation-model=pic -mtriple=powerpc64-apple-darwin8 | FileCheck %s -check-prefix=PIC64
-; RUN: llc < %s -relocation-model=dynamic-no-pic | FileCheck %s -check-prefix=DYNAMIC
+; RUN: llc < %s -relocation-model=dynamic-no-pic -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=DYNAMIC
 ; RUN: llc < %s -relocation-model=dynamic-no-pic -mtriple=powerpc64-apple-darwin8 | FileCheck %s -check-prefix=DYNAMIC64
 ; PR4482
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
@@ -18,6 +19,10 @@ entry:
 ; PIC: bl L_exact_log2$stub
 ; PIC: blr
 
+; PICELF: foo:
+; PICELF: bl exact_log2@PLT
+; PICELF: blr
+
 ; PIC64: _foo:
 ; PIC64: bl L_exact_log2$stub
 ; PIC64: blr
diff --git a/test/CodeGen/PowerPC/blockaddress.ll b/test/CodeGen/PowerPC/blockaddress.ll
new file mode 100644
index 0000000..c1981e2
--- /dev/null
+++ b/test/CodeGen/PowerPC/blockaddress.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -code-model=small -march=ppc64 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=SMALL
+; RUN: llc < %s -code-model=medium -march=ppc64 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=MEDIUM
+; RUN: llc < %s -code-model=large -march=ppc64 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=MEDIUM
+; RUN: llc < %s -code-model=small -march=ppc64 -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=SMALL
+; RUN: llc < %s -code-model=medium -march=ppc64 -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=MEDIUM
+; RUN: llc < %s -code-model=large -march=ppc64 -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=MEDIUM
+
+define i8* @test() {
+entry:
+  br label %here
+
+here:                                             ; preds = %entry
+; MEDIUM: .Ltmp[[TMP0:[0-9]+]]:
+; MEDIUM: addis [[R0:[0-9]+]], 2, .LC[[LC0:[0-9]+]]@toc@ha
+; MEDIUM: ld 3, .LC[[LC0]]@toc@l([[R0]])
+; MEDIUM: blr
+; MEDIUM: .LC[[LC0]]:
+; MEDIUM: .tc .Ltmp[[TMP0]][TC],.Ltmp[[TMP0]]
+; SMALL: .Ltmp[[TMP0:[0-9]+]]:
+; SMALL: ld 3, .LC[[LC0:[0-9]+]]@toc(2)
+; SMALL: blr
+; SMALL: .LC[[LC0]]:
+; SMALL: .tc .Ltmp[[TMP0]][TC],.Ltmp[[TMP0]]
+  ret i8* blockaddress(@test, %here)
+}
+
diff --git a/test/CodeGen/PowerPC/buildvec_canonicalize.ll b/test/CodeGen/PowerPC/buildvec_canonicalize.ll
index e155a35..b70671b 100644
--- a/test/CodeGen/PowerPC/buildvec_canonicalize.ll
+++ b/test/CodeGen/PowerPC/buildvec_canonicalize.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 -mattr=+altivec --enable-unsafe-fp-math | FileCheck %s
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mattr=+altivec --enable-unsafe-fp-math | FileCheck %s
 
 define void @VXOR(<4 x float>* %P1, <4 x i32>* %P2, <4 x float>* %P3) {
         %tmp = load <4 x float>* %P3            ; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/PowerPC/byval-aliased.ll b/test/CodeGen/PowerPC/byval-aliased.ll
new file mode 100644
index 0000000..9ef2f02
--- /dev/null
+++ b/test/CodeGen/PowerPC/byval-aliased.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mcpu=ppc64 < %s | FileCheck %s
+target datalayout = "E-m:o-p:32:32-f64:32:64-n32"
+target triple = "powerpc-apple-macosx10.5.0"
+ 
+%struct.sm = type { i8, i8 }
+ 
+; Function Attrs: nounwind ssp
+define void @foo(%struct.sm* byval %s) #0 {
+entry:
+  %a = getelementptr inbounds %struct.sm* %s, i32 0, i32 0
+  %0 = load i8* %a, align 1
+  %conv2 = zext i8 %0 to i32
+  %add = add nuw nsw i32 %conv2, 3
+  %conv1 = trunc i32 %add to i8
+  store i8 %conv1, i8* %a, align 1
+  call void @bar(%struct.sm* byval %s, %struct.sm* byval %s) #1
+  ret void
+}
+
+; CHECK-LABEL: @foo
+; CHECK: stb {{r[0-9]+}}, [[OFF:[0-9]+]]({{r[3]?1}})
+; CHECK: lhz r3, [[OFF]]({{r[3]?1}})
+; CHECK: bl _bar
+; CHECK: blr
+ 
+declare void @bar(%struct.sm* byval, %struct.sm* byval)
+ 
+attributes #0 = { nounwind ssp }
+attributes #1 = { nounwind }
+ 
diff --git a/test/CodeGen/PowerPC/complex-return.ll b/test/CodeGen/PowerPC/complex-return.ll
index 5ac7524..9d25e61 100644
--- a/test/CodeGen/PowerPC/complex-return.ll
+++ b/test/CodeGen/PowerPC/complex-return.ll
@@ -24,10 +24,10 @@ entry:
 }
 
 ; CHECK-LABEL: foo:
+; CHECK: lfd 1
+; CHECK: lfd 2
 ; CHECK: lfd 3
 ; CHECK: lfd 4
-; CHECK: lfd 2
-; CHECK: lfd 1
 
 define { float, float } @oof() nounwind {
 entry:
diff --git a/test/CodeGen/PowerPC/copysignl.ll b/test/CodeGen/PowerPC/copysignl.ll
index 4b801b7..e280f83 100644
--- a/test/CodeGen/PowerPC/copysignl.ll
+++ b/test/CodeGen/PowerPC/copysignl.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu -mattr=-vsx < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu -mattr=+vsx < %s | FileCheck %s -check-prefix=CHECK-VSX
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -11,6 +12,9 @@ entry:
 ; CHECK-LABEL: @foo_d_ll
 ; CHECK: fcpsgn 1, 3, 1
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_d_ll
+; CHECK-VSX: xscpsgndp 1, 3, 1
+; CHECK-VSX: blr
 }
 
 declare ppc_fp128 @copysignl(ppc_fp128, ppc_fp128) #0
@@ -24,6 +28,9 @@ entry:
 ; CHECK-LABEL: @foo_dl
 ; CHECK: fcpsgn 1, 2, 1
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_dl
+; CHECK-VSX: xscpsgndp 1, 2, 1
+; CHECK-VSX: blr
 }
 
 declare double @copysign(double, double) #0
@@ -37,6 +44,9 @@ entry:
 ; CHECK-LABEL: @foo_ll
 ; CHECK: bl copysignl
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_ll
+; CHECK-VSX: bl copysignl
+; CHECK-VSX: blr
 }
 
 define ppc_fp128 @foo_ld(double %a, double %b) #0 {
@@ -49,6 +59,9 @@ entry:
 ; CHECK-LABEL: @foo_ld
 ; CHECK: bl copysignl
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_ld
+; CHECK-VSX: bl copysignl
+; CHECK-VSX: blr
 }
 
 define ppc_fp128 @foo_lf(double %a, float %b) #0 {
@@ -61,6 +74,9 @@ entry:
 ; CHECK-LABEL: @foo_lf
 ; CHECK: bl copysignl
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_lf
+; CHECK-VSX: bl copysignl
+; CHECK-VSX: blr
 }
 
 attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/PowerPC/dbg.ll b/test/CodeGen/PowerPC/dbg.ll
index 6beea55..04338a6 100644
--- a/test/CodeGen/PowerPC/dbg.ll
+++ b/test/CodeGen/PowerPC/dbg.ll
@@ -6,34 +6,34 @@ target triple = "powerpc64-unknown-linux-gnu"
 
 define i32 @main(i32 %argc, i8** nocapture %argv) nounwind readnone {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !15), !dbg !17
-  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !16), !dbg !18
+  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !17
+  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !18
   %add = add nsw i32 %argc, 1, !dbg !19
   ret i32 %add, !dbg !19
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!22}
 
-!0 = metadata !{i32 720913, metadata !21, i32 12, metadata !"clang version 3.1", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1, metadata !"", metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.1\001\00\000\00\000", metadata !21, metadata !1, metadata !1, metadata !3, metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !21, null, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !13, i32 0} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 720937, metadata !21} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\00256\001\000", metadata !21, null, metadata !7, null, i32 (i32, i8**)* @main, null, null, metadata !13} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x29", metadata !21} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9, metadata !10}
-!9 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 720911, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 720911, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{i32 720932, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !12} ; [ DW_TAG_pointer_type ]
+!12 = metadata !{metadata !"0x24\00char\000\008\008\000\000\008", null, null} ; [ DW_TAG_base_type ]
 !13 = metadata !{metadata !15, metadata !16}
-!15 = metadata !{i32 721153, metadata !5, metadata !"argc", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!16 = metadata !{i32 721153, metadata !5, metadata !"argv", metadata !6, i32 33554433, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!15 = metadata !{metadata !"0x101\00argc\0016777217\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
+!16 = metadata !{metadata !"0x101\00argv\0033554433\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
 !17 = metadata !{i32 1, i32 14, metadata !5, null}
 !18 = metadata !{i32 1, i32 26, metadata !5, null}
 !19 = metadata !{i32 2, i32 3, metadata !20, null}
-!20 = metadata !{i32 720907, metadata !21, metadata !5, i32 1, i32 34, i32 0} ; [ DW_TAG_lexical_block ]
+!20 = metadata !{metadata !"0xb\001\0034\000", metadata !21, metadata !5} ; [ DW_TAG_lexical_block ]
 !21 = metadata !{metadata !"dbg.c", metadata !"/src"}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/PowerPC/empty-functions.ll b/test/CodeGen/PowerPC/empty-functions.ll
index 3a2907d..e32a847 100644
--- a/test/CodeGen/PowerPC/empty-functions.ll
+++ b/test/CodeGen/PowerPC/empty-functions.ll
@@ -1,12 +1,43 @@
-; RUN: llc < %s -mtriple=powerpc-apple-darwin | FileCheck -check-prefix=CHECK-NO-FP %s
-; RUN: llc < %s -mtriple=powerpc-apple-darwin -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
+; RUN: llc < %s -mtriple=powerpc-apple-darwin | FileCheck -check-prefix=CHECK-MACHO %s
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -disable-fp-elim | FileCheck -check-prefix=CHECK-MACHO %s
+; RUN: llc < %s -mtriple=powerpc-linux-gnu | FileCheck -check-prefix=LINUX-NO-FP %s
+; RUN: llc < %s -mtriple=powerpc-linux-gnu -disable-fp-elim | FileCheck -check-prefix=LINUX-FP %s
 
 define void @func() {
 entry:
   unreachable
 }
-; CHECK-NO-FP:     _func:
-; CHECK-NO-FP:     nop
 
-; CHECK-FP:      _func:
-; CHECK-FP:      nop
+; MachO cannot handle an empty function.
+; CHECK-MACHO:     _func:
+; CHECK-MACHO-NEXT: .cfi_startproc
+; CHECK-MACHO-NEXT: {{^}};
+; CHECK-MACHO-NEXT:     nop
+; CHECK-MACHO-NEXT: .cfi_endproc
+
+; An empty function is perfectly fine on ELF.
+; LINUX-NO-FP: func:
+; LINUX-NO-FP-NEXT: .cfi_startproc
+; LINUX-NO-FP-NEXT: {{^}}#
+; LINUX-NO-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-NO-FP-NEXT: .size   func, .L{{.*}}-func
+; LINUX-NO-FP-NEXT: .cfi_endproc
+
+; A cfi directive can point to the end of a function. It (and in fact the
+; entire body) could be optimized out because of the unreachable, but we
+; don't do it right now.
+; LINUX-FP: func:
+; LINUX-FP-NEXT: .cfi_startproc
+; LINUX-FP-NEXT: {{^}}#
+; LINUX-FP-NEXT: stw 31, -4(1)
+; LINUX-FP-NEXT: stwu 1, -16(1)
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT:  .cfi_def_cfa_offset 16
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_offset r31, -4
+; LINUX-FP-NEXT: mr 31, 1
+; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_def_cfa_register r31
+; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .size   func, .Ltmp3-func
+; LINUX-FP-NEXT: .cfi_endproc
diff --git a/test/CodeGen/PowerPC/fabs.ll b/test/CodeGen/PowerPC/fabs.ll
index ddcce74..36aa23d 100644
--- a/test/CodeGen/PowerPC/fabs.ll
+++ b/test/CodeGen/PowerPC/fabs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin | grep "fabs f1, f1"
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mtriple=powerpc-apple-darwin | grep "fabs f1, f1"
 
 define double @fabs(double %f) {
 entry:
diff --git a/test/CodeGen/PowerPC/fast-isel-call.ll b/test/CodeGen/PowerPC/fast-isel-call.ll
index 33a8ba9..b2cc75e 100644
--- a/test/CodeGen/PowerPC/fast-isel-call.ll
+++ b/test/CodeGen/PowerPC/fast-isel-call.ll
@@ -1,4 +1,8 @@
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+; FIXME: FastISel currently returns false if it hits code that uses VSX
+; registers and with -fast-isel-abort turned on the test case will then fail.
+; When fastisel better supports VSX fix up this test case.
+;
+; RUN: llc < %s -O0 -verify-machineinstrs -mattr=-vsx -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
 
 define i32 @t1(i8 signext %a) nounwind {
   %1 = sext i8 %a to i32
@@ -57,11 +61,11 @@ entry:
 ; ELF64: t10
   %call = call i32 @bar(i8 zeroext 0, i8 zeroext -8, i8 zeroext -69, i8 zeroext 28, i8 zeroext 40, i8 zeroext -70)
 ; ELF64: li 3, 0
-; ELF64: li 4, 248
-; ELF64: li 5, 187
+; ELF64: li 4, -8
+; ELF64: li 5, -69
 ; ELF64: li 6, 28
 ; ELF64: li 7, 40
-; ELF64: li 8, 186
+; ELF64: li 8, -70
 ; ELF64: rldicl 3, 3, 0, 56
 ; ELF64: rldicl 4, 4, 0, 56
 ; ELF64: rldicl 5, 5, 0, 56
diff --git a/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll b/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll
index 33f7a79..c1f6b63 100644
--- a/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll
+++ b/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll
@@ -1,5 +1,8 @@
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
-
+; FIXME: FastISel currently returns false if it hits code that uses VSX
+; registers and with -fast-isel-abort turned on the test case will then fail.
+; When fastisel better supports VSX fix up this test case.
+;
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck %s --check-prefix=ELF64
 define void @t1a(float %a) uwtable ssp {
 entry:
 ; ELF64: t1a
diff --git a/test/CodeGen/PowerPC/fast-isel-conversion.ll b/test/CodeGen/PowerPC/fast-isel-conversion.ll
index 5e00675..b0e29c1 100644
--- a/test/CodeGen/PowerPC/fast-isel-conversion.ll
+++ b/test/CodeGen/PowerPC/fast-isel-conversion.ll
@@ -1,5 +1,10 @@
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
-; RUN: llc < %s -O0 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=970 | FileCheck %s --check-prefix=PPC970
+; FIXME: FastISel currently returns false if it hits code that uses VSX
+; registers and with -fast-isel-abort turned on the test case will then fail.
+; When fastisel better supports VSX fix up this test case.
+;
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck %s --check-prefix=ELF64
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mattr=-vsx | FileCheck %s --check-prefix=ELF64LE
+; RUN: llc < %s -O0 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=970 -mattr=-vsx | FileCheck %s --check-prefix=PPC970
 
 ;; Tests for 970 don't use -fast-isel-abort because we intentionally punt
 ;; to SelectionDAG in some cases.
@@ -9,12 +14,16 @@
 define void @sitofp_single_i64(i64 %a, float %b) nounwind ssp {
 entry:
 ; ELF64: sitofp_single_i64
+; ELF64LE: sitofp_single_i64
 ; PPC970: sitofp_single_i64
   %b.addr = alloca float, align 4
   %conv = sitofp i64 %a to float
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfids
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfids
 ; PPC970: std
 ; PPC970: lfd
 ; PPC970: fcfid
@@ -26,12 +35,20 @@ entry:
 define void @sitofp_single_i32(i32 %a, float %b) nounwind ssp {
 entry:
 ; ELF64: sitofp_single_i32
+; ELF64LE: sitofp_single_i32
 ; PPC970: sitofp_single_i32
   %b.addr = alloca float, align 4
   %conv = sitofp i32 %a to float
 ; ELF64: std
+; stack offset used to load the float: 65524 = -16 + 4
+; ELF64: ori {{[0-9]+}}, {{[0-9]+}}, 65524 
 ; ELF64: lfiwax
 ; ELF64: fcfids
+; ELF64LE: std
+; stack offset used to load the float: 65520 = -16 + 0
+; ELF64LE: ori {{[0-9]+}}, {{[0-9]+}}, 65520
+; ELF64LE: lfiwax
+; ELF64LE: fcfids
 ; PPC970: std
 ; PPC970: lfd
 ; PPC970: fcfid
@@ -43,6 +60,7 @@ entry:
 define void @sitofp_single_i16(i16 %a, float %b) nounwind ssp {
 entry:
 ; ELF64: sitofp_single_i16
+; ELF64LE: sitofp_single_i16
 ; PPC970: sitofp_single_i16
   %b.addr = alloca float, align 4
   %conv = sitofp i16 %a to float
@@ -50,6 +68,10 @@ entry:
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfids
+; ELF64LE: extsh
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfids
 ; PPC970: extsh
 ; PPC970: std
 ; PPC970: lfd
@@ -62,6 +84,7 @@ entry:
 define void @sitofp_single_i8(i8 %a) nounwind ssp {
 entry:
 ; ELF64: sitofp_single_i8
+; ELF64LE: sitofp_single_i8
 ; PPC970: sitofp_single_i8
   %b.addr = alloca float, align 4
   %conv = sitofp i8 %a to float
@@ -69,6 +92,10 @@ entry:
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfids
+; ELF64LE: extsb
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfids
 ; PPC970: extsb
 ; PPC970: std
 ; PPC970: lfd
@@ -81,12 +108,20 @@ entry:
 define void @sitofp_double_i32(i32 %a, double %b) nounwind ssp {
 entry:
 ; ELF64: sitofp_double_i32
+; ELF64LE: sitofp_double_i32
 ; PPC970: sitofp_double_i32
   %b.addr = alloca double, align 8
   %conv = sitofp i32 %a to double
 ; ELF64: std
+; stack offset used to load the float: 65524 = -16 + 4
+; ELF64: ori {{[0-9]+}}, {{[0-9]+}}, 65524
 ; ELF64: lfiwax
 ; ELF64: fcfid
+; ELF64LE: std
+; stack offset used to load the float: 65520 = -16 + 0
+; ELF64LE: ori {{[0-9]+}}, {{[0-9]+}}, 65520
+; ELF64LE: lfiwax
+; ELF64LE: fcfid
 ; PPC970: std
 ; PPC970: lfd
 ; PPC970: fcfid
@@ -97,12 +132,16 @@ entry:
 define void @sitofp_double_i64(i64 %a, double %b) nounwind ssp {
 entry:
 ; ELF64: sitofp_double_i64
+; ELF64LE: sitofp_double_i64
 ; PPC970: sitofp_double_i64
   %b.addr = alloca double, align 8
   %conv = sitofp i64 %a to double
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfid
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfid
 ; PPC970: std
 ; PPC970: lfd
 ; PPC970: fcfid
@@ -113,6 +152,7 @@ entry:
 define void @sitofp_double_i16(i16 %a, double %b) nounwind ssp {
 entry:
 ; ELF64: sitofp_double_i16
+; ELF64LE: sitofp_double_i16
 ; PPC970: sitofp_double_i16
   %b.addr = alloca double, align 8
   %conv = sitofp i16 %a to double
@@ -120,6 +160,10 @@ entry:
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfid
+; ELF64LE: extsh
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfid
 ; PPC970: extsh
 ; PPC970: std
 ; PPC970: lfd
@@ -131,6 +175,7 @@ entry:
 define void @sitofp_double_i8(i8 %a, double %b) nounwind ssp {
 entry:
 ; ELF64: sitofp_double_i8
+; ELF64LE: sitofp_double_i8
 ; PPC970: sitofp_double_i8
   %b.addr = alloca double, align 8
   %conv = sitofp i8 %a to double
@@ -138,6 +183,10 @@ entry:
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfid
+; ELF64LE: extsb
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfid
 ; PPC970: extsb
 ; PPC970: std
 ; PPC970: lfd
@@ -151,12 +200,16 @@ entry:
 define void @uitofp_single_i64(i64 %a, float %b) nounwind ssp {
 entry:
 ; ELF64: uitofp_single_i64
+; ELF64LE: uitofp_single_i64
 ; PPC970: uitofp_single_i64
   %b.addr = alloca float, align 4
   %conv = uitofp i64 %a to float
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfidus
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfidus
 ; PPC970-NOT: fcfidus
   store float %conv, float* %b.addr, align 4
   ret void
@@ -165,12 +218,20 @@ entry:
 define void @uitofp_single_i32(i32 %a, float %b) nounwind ssp {
 entry:
 ; ELF64: uitofp_single_i32
+; ELF64LE: uitofp_single_i32
 ; PPC970: uitofp_single_i32
   %b.addr = alloca float, align 4
   %conv = uitofp i32 %a to float
 ; ELF64: std
+; stack offset used to load the float: 65524 = -16 + 4
+; ELF64: ori {{[0-9]+}}, {{[0-9]+}}, 65524
 ; ELF64: lfiwzx
 ; ELF64: fcfidus
+; ELF64LE: std
+; stack offset used to load the float: 65520 = -16 + 0
+; ELF64LE: ori {{[0-9]+}}, {{[0-9]+}}, 65520
+; ELF64LE: lfiwzx
+; ELF64LE: fcfidus
 ; PPC970-NOT: lfiwzx
 ; PPC970-NOT: fcfidus
   store float %conv, float* %b.addr, align 4
@@ -180,6 +241,7 @@ entry:
 define void @uitofp_single_i16(i16 %a, float %b) nounwind ssp {
 entry:
 ; ELF64: uitofp_single_i16
+; ELF64LE: uitofp_single_i16
 ; PPC970: uitofp_single_i16
   %b.addr = alloca float, align 4
   %conv = uitofp i16 %a to float
@@ -187,6 +249,10 @@ entry:
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfidus
+; ELF64LE: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfidus
 ; PPC970: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 16, 31
 ; PPC970: std
 ; PPC970: lfd
@@ -199,6 +265,7 @@ entry:
 define void @uitofp_single_i8(i8 %a) nounwind ssp {
 entry:
 ; ELF64: uitofp_single_i8
+; ELF64LE: uitofp_single_i8
 ; PPC970: uitofp_single_i8
   %b.addr = alloca float, align 4
   %conv = uitofp i8 %a to float
@@ -206,6 +273,10 @@ entry:
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfidus
+; ELF64LE: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfidus
 ; PPC970: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 24, 31
 ; PPC970: std
 ; PPC970: lfd
@@ -218,12 +289,16 @@ entry:
 define void @uitofp_double_i64(i64 %a, double %b) nounwind ssp {
 entry:
 ; ELF64: uitofp_double_i64
+; ELF64LE: uitofp_double_i64
 ; PPC970: uitofp_double_i64
   %b.addr = alloca double, align 8
   %conv = uitofp i64 %a to double
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfidu
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfidu
 ; PPC970-NOT: fcfidu
   store double %conv, double* %b.addr, align 8
   ret void
@@ -232,12 +307,20 @@ entry:
 define void @uitofp_double_i32(i32 %a, double %b) nounwind ssp {
 entry:
 ; ELF64: uitofp_double_i32
+; ELF64LE: uitofp_double_i32
 ; PPC970: uitofp_double_i32
   %b.addr = alloca double, align 8
   %conv = uitofp i32 %a to double
 ; ELF64: std
+; stack offset used to load the float: 65524 = -16 + 4
+; ELF64: ori {{[0-9]+}}, {{[0-9]+}}, 65524
 ; ELF64: lfiwzx
 ; ELF64: fcfidu
+; ELF64LE: std
+; stack offset used to load the float: 65520 = -16 + 0
+; ELF64LE: ori {{[0-9]+}}, {{[0-9]+}}, 65520
+; ELF64LE: lfiwzx
+; ELF64LE: fcfidu
 ; PPC970-NOT: lfiwzx
 ; PPC970-NOT: fcfidu
   store double %conv, double* %b.addr, align 8
@@ -247,6 +330,7 @@ entry:
 define void @uitofp_double_i16(i16 %a, double %b) nounwind ssp {
 entry:
 ; ELF64: uitofp_double_i16
+; ELF64LE: uitofp_double_i16
 ; PPC970: uitofp_double_i16
   %b.addr = alloca double, align 8
   %conv = uitofp i16 %a to double
@@ -254,6 +338,10 @@ entry:
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfidu
+; ELF64LE: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfidu
 ; PPC970: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 16, 31
 ; PPC970: std
 ; PPC970: lfd
@@ -265,6 +353,7 @@ entry:
 define void @uitofp_double_i8(i8 %a, double %b) nounwind ssp {
 entry:
 ; ELF64: uitofp_double_i8
+; ELF64LE: uitofp_double_i8
 ; PPC970: uitofp_double_i8
   %b.addr = alloca double, align 8
   %conv = uitofp i8 %a to double
@@ -272,6 +361,10 @@ entry:
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfidu
+; ELF64LE: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfidu
 ; PPC970: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 24, 31
 ; PPC970: std
 ; PPC970: lfd
@@ -285,12 +378,16 @@ entry:
 define void @fptosi_float_i32(float %a) nounwind ssp {
 entry:
 ; ELF64: fptosi_float_i32
+; ELF64LE: fptosi_float_i32
 ; PPC970: fptosi_float_i32
   %b.addr = alloca i32, align 4
   %conv = fptosi float %a to i32
 ; ELF64: fctiwz
 ; ELF64: stfd
 ; ELF64: lwa
+; ELF64LE: fctiwz
+; ELF64LE: stfd
+; ELF64LE: lwa
 ; PPC970: fctiwz
 ; PPC970: stfd
 ; PPC970: lwa
@@ -301,12 +398,16 @@ entry:
 define void @fptosi_float_i64(float %a) nounwind ssp {
 entry:
 ; ELF64: fptosi_float_i64
+; ELF64LE: fptosi_float_i64
 ; PPC970: fptosi_float_i64
   %b.addr = alloca i64, align 4
   %conv = fptosi float %a to i64
 ; ELF64: fctidz
 ; ELF64: stfd
 ; ELF64: ld
+; ELF64LE: fctidz
+; ELF64LE: stfd
+; ELF64LE: ld
 ; PPC970: fctidz
 ; PPC970: stfd
 ; PPC970: ld
@@ -317,12 +418,16 @@ entry:
 define void @fptosi_double_i32(double %a) nounwind ssp {
 entry:
 ; ELF64: fptosi_double_i32
+; ELF64LE: fptosi_double_i32
 ; PPC970: fptosi_double_i32
   %b.addr = alloca i32, align 8
   %conv = fptosi double %a to i32
 ; ELF64: fctiwz
 ; ELF64: stfd
 ; ELF64: lwa
+; ELF64LE: fctiwz
+; ELF64LE: stfd
+; ELF64LE: lwa
 ; PPC970: fctiwz
 ; PPC970: stfd
 ; PPC970: lwa
@@ -333,12 +438,16 @@ entry:
 define void @fptosi_double_i64(double %a) nounwind ssp {
 entry:
 ; ELF64: fptosi_double_i64
+; ELF64LE: fptosi_double_i64
 ; PPC970: fptosi_double_i64
   %b.addr = alloca i64, align 8
   %conv = fptosi double %a to i64
 ; ELF64: fctidz
 ; ELF64: stfd
 ; ELF64: ld
+; ELF64LE: fctidz
+; ELF64LE: stfd
+; ELF64LE: ld
 ; PPC970: fctidz
 ; PPC970: stfd
 ; PPC970: ld
@@ -351,12 +460,16 @@ entry:
 define void @fptoui_float_i32(float %a) nounwind ssp {
 entry:
 ; ELF64: fptoui_float_i32
+; ELF64LE: fptoui_float_i32
 ; PPC970: fptoui_float_i32
   %b.addr = alloca i32, align 4
   %conv = fptoui float %a to i32
 ; ELF64: fctiwuz
 ; ELF64: stfd
 ; ELF64: lwz
+; ELF64LE: fctiwuz
+; ELF64LE: stfd
+; ELF64LE: lwz
 ; PPC970: fctidz
 ; PPC970: stfd
 ; PPC970: lwz
@@ -367,12 +480,16 @@ entry:
 define void @fptoui_float_i64(float %a) nounwind ssp {
 entry:
 ; ELF64: fptoui_float_i64
+; ELF64LE: fptoui_float_i64
 ; PPC970: fptoui_float_i64
   %b.addr = alloca i64, align 4
   %conv = fptoui float %a to i64
 ; ELF64: fctiduz
 ; ELF64: stfd
 ; ELF64: ld
+; ELF64LE: fctiduz
+; ELF64LE: stfd
+; ELF64LE: ld
 ; PPC970-NOT: fctiduz
   store i64 %conv, i64* %b.addr, align 4
   ret void
@@ -381,12 +498,16 @@ entry:
 define void @fptoui_double_i32(double %a) nounwind ssp {
 entry:
 ; ELF64: fptoui_double_i32
+; ELF64LE: fptoui_double_i32
 ; PPC970: fptoui_double_i32
   %b.addr = alloca i32, align 8
   %conv = fptoui double %a to i32
 ; ELF64: fctiwuz
 ; ELF64: stfd
 ; ELF64: lwz
+; ELF64LE: fctiwuz
+; ELF64LE: stfd
+; ELF64LE: lwz
 ; PPC970: fctidz
 ; PPC970: stfd
 ; PPC970: lwz
@@ -397,12 +518,16 @@ entry:
 define void @fptoui_double_i64(double %a) nounwind ssp {
 entry:
 ; ELF64: fptoui_double_i64
+; ELF64LE: fptoui_double_i64
 ; PPC970: fptoui_double_i64
   %b.addr = alloca i64, align 8
   %conv = fptoui double %a to i64
 ; ELF64: fctiduz
 ; ELF64: stfd
 ; ELF64: ld
+; ELF64LE: fctiduz
+; ELF64LE: stfd
+; ELF64LE: ld
 ; PPC970-NOT: fctiduz
   store i64 %conv, i64* %b.addr, align 8
   ret void
diff --git a/test/CodeGen/PowerPC/fast-isel-load-store.ll b/test/CodeGen/PowerPC/fast-isel-load-store.ll
index 026b15f..ef702e2 100644
--- a/test/CodeGen/PowerPC/fast-isel-load-store.ll
+++ b/test/CodeGen/PowerPC/fast-isel-load-store.ll
@@ -1,4 +1,8 @@
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+; FIXME: FastISel currently returns false if it hits code that uses VSX
+; registers and with -fast-isel-abort turned on the test case will then fail.
+; When fastisel better supports VSX fix up this test case.
+;
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel -fast-isel-abort -mattr=-vsx -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
 
 ; This test verifies that load/store instructions are properly generated,
 ; and that they pass MI verification.
diff --git a/test/CodeGen/PowerPC/fast-isel-ret.ll b/test/CodeGen/PowerPC/fast-isel-ret.ll
index fa19f8b..ae34fbf 100644
--- a/test/CodeGen/PowerPC/fast-isel-ret.ll
+++ b/test/CodeGen/PowerPC/fast-isel-ret.ll
@@ -1,8 +1,44 @@
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+; FIXME: FastISel currently returns false if it hits code that uses VSX
+; registers and with -fast-isel-abort turned on the test case will then fail.
+; When fastisel better supports VSX fix up this test case.
+;
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck %s --check-prefix=ELF64
+
+define zeroext i1 @rettrue() nounwind uwtable ssp {
+entry:
+; ELF64-LABEL: rettrue
+; ELF64: li 3, 1
+; ELF64: blr
+  ret i1 true
+}
+
+define zeroext i1 @retfalse() nounwind uwtable ssp {
+entry:
+; ELF64-LABEL: retfalse
+; ELF64: li 3, 0
+; ELF64: blr
+  ret i1 false
+}
+
+define signext i1 @retstrue() nounwind uwtable ssp {
+entry:
+; ELF64-LABEL: retstrue
+; ELF64: li 3, -1
+; ELF64: blr
+  ret i1 true
+}
+
+define signext i1 @retsfalse() nounwind uwtable ssp {
+entry:
+; ELF64-LABEL: retsfalse
+; ELF64: li 3, 0
+; ELF64: blr
+  ret i1 false
+}
 
 define signext i8 @ret2(i8 signext %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret2
+; ELF64-LABEL: ret2
 ; ELF64: extsb
 ; ELF64: blr
   ret i8 %a
@@ -10,7 +46,7 @@ entry:
 
 define zeroext i8 @ret3(i8 signext %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret3
+; ELF64-LABEL: ret3
 ; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
 ; ELF64: blr
   ret i8 %a
@@ -18,7 +54,7 @@ entry:
 
 define signext i16 @ret4(i16 signext %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret4
+; ELF64-LABEL: ret4
 ; ELF64: extsh
 ; ELF64: blr
   ret i16 %a
@@ -26,7 +62,7 @@ entry:
 
 define zeroext i16 @ret5(i16 signext %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret5
+; ELF64-LABEL: ret5
 ; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
 ; ELF64: blr
   ret i16 %a
@@ -34,7 +70,7 @@ entry:
 
 define i16 @ret6(i16 %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret6
+; ELF64-LABEL: ret6
 ; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
 ; ELF64: blr
   ret i16 %a
@@ -42,7 +78,7 @@ entry:
 
 define signext i32 @ret7(i32 signext %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret7
+; ELF64-LABEL: ret7
 ; ELF64: extsw
 ; ELF64: blr
   ret i32 %a
@@ -50,7 +86,7 @@ entry:
 
 define zeroext i32 @ret8(i32 signext %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret8
+; ELF64-LABEL: ret8
 ; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 32
 ; ELF64: blr
   ret i32 %a
@@ -58,7 +94,7 @@ entry:
 
 define i32 @ret9(i32 %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret9
+; ELF64-LABEL: ret9
 ; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 32
 ; ELF64: blr
   ret i32 %a
@@ -66,7 +102,7 @@ entry:
 
 define i64 @ret10(i64 %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret10
+; ELF64-LABEL: ret10
 ; ELF64-NOT: exts
 ; ELF64-NOT: rldicl
 ; ELF64: blr
@@ -75,21 +111,21 @@ entry:
 
 define float @ret11(float %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret11
+; ELF64-LABEL: ret11
 ; ELF64: blr
   ret float %a
 }
 
 define double @ret12(double %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret12
+; ELF64-LABEL: ret12
 ; ELF64: blr
   ret double %a
 }
 
 define i8 @ret13() nounwind uwtable ssp {
 entry:
-; ELF64: ret13
+; ELF64-LABEL: ret13
 ; ELF64: li
 ; ELF64: blr
   ret i8 15;
@@ -97,7 +133,7 @@ entry:
 
 define i16 @ret14() nounwind uwtable ssp {
 entry:
-; ELF64: ret14
+; ELF64-LABEL: ret14
 ; ELF64: li
 ; ELF64: blr
   ret i16 -225;
@@ -105,7 +141,7 @@ entry:
 
 define i32 @ret15() nounwind uwtable ssp {
 entry:
-; ELF64: ret15
+; ELF64-LABEL: ret15
 ; ELF64: lis
 ; ELF64: ori
 ; ELF64: blr
@@ -114,7 +150,7 @@ entry:
 
 define i64 @ret16() nounwind uwtable ssp {
 entry:
-; ELF64: ret16
+; ELF64-LABEL: ret16
 ; ELF64: li
 ; ELF64: sldi
 ; ELF64: oris
@@ -125,7 +161,7 @@ entry:
 
 define float @ret17() nounwind uwtable ssp {
 entry:
-; ELF64: ret17
+; ELF64-LABEL: ret17
 ; ELF64: addis
 ; ELF64: lfs
 ; ELF64: blr
@@ -134,7 +170,7 @@ entry:
 
 define double @ret18() nounwind uwtable ssp {
 entry:
-; ELF64: ret18
+; ELF64-LABEL: ret18
 ; ELF64: addis
 ; ELF64: lfd
 ; ELF64: blr
diff --git a/test/CodeGen/PowerPC/fcpsgn.ll b/test/CodeGen/PowerPC/fcpsgn.ll
index f469981..4d4afc6 100644
--- a/test/CodeGen/PowerPC/fcpsgn.ll
+++ b/test/CodeGen/PowerPC/fcpsgn.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx < %s | FileCheck -check-prefix=CHECK-VSX %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -10,6 +11,9 @@ entry:
 ; CHECK-LABEL: @foo_dd
 ; CHECK: fcpsgn 1, 2, 1
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_dd
+; CHECK-VSX: xscpsgndp 1, 2, 1
+; CHECK-VSX: blr
 }
 
 declare double @copysign(double, double) #0
@@ -22,6 +26,9 @@ entry:
 ; CHECK-LABEL: @foo_ss
 ; CHECK: fcpsgn 1, 2, 1
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_ss
+; CHECK-VSX: fcpsgn 1, 2, 1
+; CHECK-VSX: blr
 }
 
 declare float @copysignf(float, float) #0
@@ -35,6 +42,9 @@ entry:
 ; CHECK-LABEL: @foo_sd
 ; CHECK: fcpsgn 1, 2, 1
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_sd
+; CHECK-VSX: fcpsgn 1, 2, 1
+; CHECK-VSX: blr
 }
 
 define double @foo_ds(double %a, float %b) #0 {
@@ -46,6 +56,9 @@ entry:
 ; CHECK-LABEL: @foo_ds
 ; CHECK: fcpsgn 1, 2, 1
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_ds
+; CHECK-VSX: fcpsgn 1, 2, 1
+; CHECK-VSX: blr
 }
 
 attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/PowerPC/fma-mutate.ll b/test/CodeGen/PowerPC/fma-mutate.ll
new file mode 100644
index 0000000..1a391f4
--- /dev/null
+++ b/test/CodeGen/PowerPC/fma-mutate.ll
@@ -0,0 +1,21 @@
+; Test several VSX FMA mutation opportunities.  The first one isn't a
+; reasonable transformation because the killed product register is the
+; same as the FMA target register.  The second one is legal.  The third
+; one doesn't fit the feeding-copy pattern.
+
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math -mattr=+vsx | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare double @llvm.sqrt.f64(double)
+
+define double @foo3(double %a) nounwind {
+  %r = call double @llvm.sqrt.f64(double %a)
+  ret double %r
+
+; CHECK: @foo3
+; CHECK: xsnmsubadp [[REG:[0-9]+]], {{[0-9]+}}, [[REG]]
+; CHECK: xsmaddmdp
+; CHECK: xsmaddadp
+}
+
diff --git a/test/CodeGen/PowerPC/fma.ll b/test/CodeGen/PowerPC/fma.ll
index db19761..ab5251b 100644
--- a/test/CodeGen/PowerPC/fma.ll
+++ b/test/CodeGen/PowerPC/fma.ll
@@ -1,12 +1,21 @@
-; RUN: llc < %s -march=ppc32 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -march=ppc32 -fp-contract=fast -mattr=-vsx | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 | FileCheck -check-prefix=CHECK-VSX %s
+
+declare double @dummy1(double) #0
+declare double @dummy2(double, double) #0
+declare double @dummy3(double, double, double) #0
 
 define double @test_FMADD1(double %A, double %B, double %C) {
 	%D = fmul double %A, %B		; <double> [#uses=1]
-	%E = fadd double %D, %C		; <double> [#uses=1]
+	%E = fadd double %C, %D		; <double> [#uses=1]
 	ret double %E
 ; CHECK-LABEL: test_FMADD1:
 ; CHECK: fmadd
 ; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FMADD1:
+; CHECK-VSX: xsmaddmdp
+; CHECK-VSX-NEXT: blr
 }
 
 define double @test_FMADD2(double %A, double %B, double %C) {
@@ -16,15 +25,38 @@ define double @test_FMADD2(double %A, double %B, double %C) {
 ; CHECK-LABEL: test_FMADD2:
 ; CHECK: fmadd
 ; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FMADD2:
+; CHECK-VSX: xsmaddmdp
+; CHECK-VSX-NEXT: blr
 }
 
-define double @test_FMSUB(double %A, double %B, double %C) {
+define double @test_FMSUB1(double %A, double %B, double %C) {
 	%D = fmul double %A, %B		; <double> [#uses=1]
 	%E = fsub double %D, %C		; <double> [#uses=1]
 	ret double %E
-; CHECK-LABEL: test_FMSUB:
+; CHECK-LABEL: test_FMSUB1:
 ; CHECK: fmsub
 ; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FMSUB1:
+; CHECK-VSX: xsmsubmdp
+; CHECK-VSX-NEXT: blr
+}
+
+define double @test_FMSUB2(double %A, double %B, double %C, double %D) {
+	%E = fmul double %A, %B 	; <double> [#uses=2]
+	%F = fadd double %E, %C 	; <double> [#uses=1]
+	%G = fsub double %E, %D 	; <double> [#uses=1]
+	%H = call double @dummy2(double %F, double %G)      ; <double> [#uses=1]
+	ret double %H
+; CHECK-LABEL: test_FMSUB2:
+; CHECK: fmadd
+; CHECK-NEXT: fmsub
+
+; CHECK-VSX-LABEL: test_FMSUB2:
+; CHECK-VSX: xsmaddadp
+; CHECK-VSX-NEXT: xsmsubmdp
 }
 
 define double @test_FNMADD1(double %A, double %B, double %C) {
@@ -35,6 +67,10 @@ define double @test_FNMADD1(double %A, double %B, double %C) {
 ; CHECK-LABEL: test_FNMADD1:
 ; CHECK: fnmadd
 ; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FNMADD1:
+; CHECK-VSX: xsnmaddmdp
+; CHECK-VSX-NEXT: blr
 }
 
 define double @test_FNMADD2(double %A, double %B, double %C) {
@@ -45,6 +81,10 @@ define double @test_FNMADD2(double %A, double %B, double %C) {
 ; CHECK-LABEL: test_FNMADD2:
 ; CHECK: fnmadd
 ; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FNMADD2:
+; CHECK-VSX: xsnmaddmdp
+; CHECK-VSX-NEXT: blr
 }
 
 define double @test_FNMSUB1(double %A, double %B, double %C) {
@@ -54,6 +94,9 @@ define double @test_FNMSUB1(double %A, double %B, double %C) {
 ; CHECK-LABEL: test_FNMSUB1:
 ; CHECK: fnmsub
 ; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FNMSUB1:
+; CHECK-VSX: xsnmsubmdp
 }
 
 define double @test_FNMSUB2(double %A, double %B, double %C) {
@@ -64,6 +107,10 @@ define double @test_FNMSUB2(double %A, double %B, double %C) {
 ; CHECK-LABEL: test_FNMSUB2:
 ; CHECK: fnmsub
 ; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FNMSUB2:
+; CHECK-VSX: xsnmsubmdp
+; CHECK-VSX-NEXT: blr
 }
 
 define float @test_FNMSUBS(float %A, float %B, float %C) {
@@ -74,4 +121,8 @@ define float @test_FNMSUBS(float %A, float %B, float %C) {
 ; CHECK-LABEL: test_FNMSUBS:
 ; CHECK: fnmsubs
 ; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FNMSUBS:
+; CHECK-VSX: fnmsubs
+; CHECK-VSX-NEXT: blr
 }
diff --git a/test/CodeGen/PowerPC/fmaxnum.ll b/test/CodeGen/PowerPC/fmaxnum.ll
new file mode 100644
index 0000000..1825850
--- /dev/null
+++ b/test/CodeGen/PowerPC/fmaxnum.ll
@@ -0,0 +1,86 @@
+; RUN: llc -march=ppc32 -mtriple=powerpc-unknown-linux-gnu < %s | FileCheck %s
+
+declare float @fmaxf(float, float)
+declare double @fmax(double, double)
+declare ppc_fp128 @fmaxl(ppc_fp128, ppc_fp128)
+declare float @llvm.maxnum.f32(float, float)
+declare double @llvm.maxnum.f64(double, double)
+declare ppc_fp128 @llvm.maxnum.ppcf128(ppc_fp128, ppc_fp128)
+
+declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
+declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>)
+
+; CHECK-LABEL: @test_fmaxf
+; CHECK: bl fmaxf
+define float @test_fmaxf(float %x, float %y) {
+  %z = call float @fmaxf(float %x, float %y) readnone
+  ret float %z
+}
+
+; CHECK-LABEL: @test_fmax
+; CHECK: bl fmax
+define double @test_fmax(double %x, double %y) {
+  %z = call double @fmax(double %x, double %y) readnone
+  ret double %z
+}
+
+; CHECK-LABEL: @test_fmaxl
+; CHECK: bl fmaxl
+define ppc_fp128 @test_fmaxl(ppc_fp128 %x, ppc_fp128 %y) {
+  %z = call ppc_fp128 @fmaxl(ppc_fp128 %x, ppc_fp128 %y) readnone
+  ret ppc_fp128 %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmaxf
+; CHECK: bl fmaxf
+define float @test_intrinsic_fmaxf(float %x, float %y) {
+  %z = call float @llvm.maxnum.f32(float %x, float %y) readnone
+  ret float %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmax
+; CHECK: bl fmax
+define double @test_intrinsic_fmax(double %x, double %y) {
+  %z = call double @llvm.maxnum.f64(double %x, double %y) readnone
+  ret double %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmaxl
+; CHECK: bl fmaxl
+define ppc_fp128 @test_intrinsic_fmaxl(ppc_fp128 %x, ppc_fp128 %y) {
+  %z = call ppc_fp128 @llvm.maxnum.ppcf128(ppc_fp128 %x, ppc_fp128 %y) readnone
+  ret ppc_fp128 %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmaxf_v2f32
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+define <2 x float> @test_intrinsic_fmaxf_v2f32(<2 x float> %x, <2 x float> %y) {
+  %z = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> %y) readnone
+  ret <2 x float> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmaxf_v4f32
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+define <4 x float> @test_intrinsic_fmaxf_v4f32(<4 x float> %x, <4 x float> %y) {
+  %z = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y) readnone
+  ret <4 x float> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmaxf_v8f32
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+define <8 x float> @test_intrinsic_fmaxf_v8f32(<8 x float> %x, <8 x float> %y) {
+  %z = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %x, <8 x float> %y) readnone
+  ret <8 x float> %z
+}
diff --git a/test/CodeGen/PowerPC/fminnum.ll b/test/CodeGen/PowerPC/fminnum.ll
new file mode 100644
index 0000000..fe91284
--- /dev/null
+++ b/test/CodeGen/PowerPC/fminnum.ll
@@ -0,0 +1,86 @@
+; RUN: llc -march=ppc32 -mtriple=powerpc-unknown-linux-gnu < %s | FileCheck %s
+
+declare float @fminf(float, float)
+declare double @fmin(double, double)
+declare ppc_fp128 @fminl(ppc_fp128, ppc_fp128)
+declare float @llvm.minnum.f32(float, float)
+declare double @llvm.minnum.f64(double, double)
+declare ppc_fp128 @llvm.minnum.ppcf128(ppc_fp128, ppc_fp128)
+
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
+declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>)
+
+; CHECK-LABEL: @test_fminf
+; CHECK: bl fminf
+define float @test_fminf(float %x, float %y) {
+  %z = call float @fminf(float %x, float %y) readnone
+  ret float %z
+}
+
+; CHECK-LABEL: @test_fmin
+; CHECK: bl fmin
+define double @test_fmin(double %x, double %y) {
+  %z = call double @fmin(double %x, double %y) readnone
+  ret double %z
+}
+
+; CHECK-LABEL: @test_fminl
+; CHECK: bl fminl
+define ppc_fp128 @test_fminl(ppc_fp128 %x, ppc_fp128 %y) {
+  %z = call ppc_fp128 @fminl(ppc_fp128 %x, ppc_fp128 %y) readnone
+  ret ppc_fp128 %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_f32
+; CHECK: bl fminf
+define float @test_intrinsic_fmin_f32(float %x, float %y) {
+  %z = call float @llvm.minnum.f32(float %x, float %y) readnone
+  ret float %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_f64
+; CHECK: bl fmin
+define double @test_intrinsic_fmin_f64(double %x, double %y) {
+  %z = call double @llvm.minnum.f64(double %x, double %y) readnone
+  ret double %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_f128
+; CHECK: bl fminl
+define ppc_fp128 @test_intrinsic_fmin_f128(ppc_fp128 %x, ppc_fp128 %y) {
+  %z = call ppc_fp128 @llvm.minnum.ppcf128(ppc_fp128 %x, ppc_fp128 %y) readnone
+  ret ppc_fp128 %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fminf_v2f32
+; CHECK: bl fminf
+; CHECK: bl fminf
+define <2 x float> @test_intrinsic_fminf_v2f32(<2 x float> %x, <2 x float> %y) {
+  %z = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> %y) readnone
+  ret <2 x float> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_v4f32
+; CHECK: bl fminf
+; CHECK: bl fminf
+; CHECK: bl fminf
+; CHECK: bl fminf
+define <4 x float> @test_intrinsic_fmin_v4f32(<4 x float> %x, <4 x float> %y) {
+  %z = call <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) readnone
+  ret <4 x float> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_v8f32
+; CHECK: bl fminf
+; CHECK: bl fminf
+; CHECK: bl fminf
+; CHECK: bl fminf
+; CHECK: bl fminf
+; CHECK: bl fminf
+; CHECK: bl fminf
+; CHECK: bl fminf
+define <8 x float> @test_intrinsic_fmin_v8f32(<8 x float> %x, <8 x float> %y) {
+  %z = call <8 x float> @llvm.minnum.v8f32(<8 x float> %x, <8 x float> %y) readnone
+  ret <8 x float> %z
+}
diff --git a/test/CodeGen/PowerPC/fnabs.ll b/test/CodeGen/PowerPC/fnabs.ll
index 9fa2dcb..fc6a04e 100644
--- a/test/CodeGen/PowerPC/fnabs.ll
+++ b/test/CodeGen/PowerPC/fnabs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 | grep fnabs
+; RUN: llc < %s -mattr=-vsx -march=ppc32 | grep fnabs
 
 declare double @fabs(double)
 
diff --git a/test/CodeGen/PowerPC/fp-branch.ll b/test/CodeGen/PowerPC/fp-branch.ll
index 673da02..f585756 100644
--- a/test/CodeGen/PowerPC/fp-branch.ll
+++ b/test/CodeGen/PowerPC/fp-branch.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 | grep fcmp | count 1
+; RUN: llc < %s -mattr=-vsx -march=ppc32 | grep fcmp | count 1
 
 declare i1 @llvm.isunordered.f64(double, double)
 
diff --git a/test/CodeGen/PowerPC/fp_to_uint.ll b/test/CodeGen/PowerPC/fp_to_uint.ll
index 1360b62..187d2d6 100644
--- a/test/CodeGen/PowerPC/fp_to_uint.ll
+++ b/test/CodeGen/PowerPC/fp_to_uint.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=ppc32 | grep fctiwz | count 1
+; RUN: llc < %s -mattr=-vsx -march=ppc32 | grep fctiwz | count 1
+
 
 define i16 @foo(float %a) {
 entry:
diff --git a/test/CodeGen/PowerPC/fsel.ll b/test/CodeGen/PowerPC/fsel.ll
index 8cd43e6..afceb63 100644
--- a/test/CodeGen/PowerPC/fsel.ll
+++ b/test/CodeGen/PowerPC/fsel.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-no-infs-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=CHECK-FM %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-no-infs-fp-math -enable-no-nans-fp-math -mattr=-vsx | FileCheck -check-prefix=CHECK-FM %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-no-infs-fp-math -enable-no-nans-fp-math -mattr=+vsx | FileCheck -check-prefix=CHECK-FM-VSX %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -16,6 +17,10 @@ entry:
 ; CHECK-FM: @zerocmp1
 ; CHECK-FM: fsel 1, 1, 2, 3
 ; CHECK-FM: blr
+
+; CHECK-FM-VSX: @zerocmp1
+; CHECK-FM-VSX: fsel 1, 1, 2, 3
+; CHECK-FM-VSX: blr
 }
 
 define double @zerocmp2(double %a, double %y, double %z) #0 {
@@ -32,6 +37,11 @@ entry:
 ; CHECK-FM: fneg [[REG:[0-9]+]], 1
 ; CHECK-FM: fsel 1, [[REG]], 3, 2
 ; CHECK-FM: blr
+
+; CHECK-FM-VSX: @zerocmp2
+; CHECK-FM-VSX: xsnegdp [[REG:[0-9]+]], 1
+; CHECK-FM-VSX: fsel 1, [[REG]], 3, 2
+; CHECK-FM-VSX: blr
 }
 
 define double @zerocmp3(double %a, double %y, double %z) #0 {
@@ -49,6 +59,12 @@ entry:
 ; CHECK-FM: fneg [[REG2:[0-9]+]], 1
 ; CHECK-FM: fsel 1, [[REG2]], [[REG]], 3
 ; CHECK-FM: blr
+
+; CHECK-FM-VSX: @zerocmp3
+; CHECK-FM-VSX: xsnegdp [[REG2:[0-9]+]], 1
+; CHECK-FM-VSX: fsel [[REG:[0-9]+]], 1, 2, 3
+; CHECK-FM-VSX: fsel 1, [[REG2]], [[REG]], 3
+; CHECK-FM-VSX: blr
 }
 
 define double @min1(double %a, double %b) #0 {
@@ -65,6 +81,11 @@ entry:
 ; CHECK-FM: fsub [[REG:[0-9]+]], 2, 1
 ; CHECK-FM: fsel 1, [[REG]], 1, 2
 ; CHECK-FM: blr
+
+; CHECK-FM-VSX: @min1
+; CHECK-FM-VSX: xssubdp [[REG:[0-9]+]], 2, 1
+; CHECK-FM-VSX: fsel 1, [[REG]], 1, 2
+; CHECK-FM-VSX: blr
 }
 
 define double @max1(double %a, double %b) #0 {
@@ -81,6 +102,11 @@ entry:
 ; CHECK-FM: fsub [[REG:[0-9]+]], 1, 2
 ; CHECK-FM: fsel 1, [[REG]], 1, 2
 ; CHECK-FM: blr
+
+; CHECK-FM-VSX: @max1
+; CHECK-FM-VSX: xssubdp [[REG:[0-9]+]], 1, 2
+; CHECK-FM-VSX: fsel 1, [[REG]], 1, 2
+; CHECK-FM-VSX: blr
 }
 
 define double @cmp1(double %a, double %b, double %y, double %z) #0 {
@@ -97,6 +123,11 @@ entry:
 ; CHECK-FM: fsub [[REG:[0-9]+]], 1, 2
 ; CHECK-FM: fsel 1, [[REG]], 3, 4
 ; CHECK-FM: blr
+
+; CHECK-FM-VSX: @cmp1
+; CHECK-FM-VSX: xssubdp [[REG:[0-9]+]], 1, 2
+; CHECK-FM-VSX: fsel 1, [[REG]], 3, 4
+; CHECK-FM-VSX: blr
 }
 
 define double @cmp2(double %a, double %b, double %y, double %z) #0 {
@@ -113,6 +144,11 @@ entry:
 ; CHECK-FM: fsub [[REG:[0-9]+]], 2, 1
 ; CHECK-FM: fsel 1, [[REG]], 4, 3
 ; CHECK-FM: blr
+
+; CHECK-FM-VSX: @cmp2
+; CHECK-FM-VSX: xssubdp [[REG:[0-9]+]], 2, 1
+; CHECK-FM-VSX: fsel 1, [[REG]], 4, 3
+; CHECK-FM-VSX: blr
 }
 
 define double @cmp3(double %a, double %b, double %y, double %z) #0 {
@@ -131,6 +167,13 @@ entry:
 ; CHECK-FM: fneg [[REG3:[0-9]+]], [[REG]]
 ; CHECK-FM: fsel 1, [[REG3]], [[REG2]], 4
 ; CHECK-FM: blr
+
+; CHECK-FM-VSX: @cmp3
+; CHECK-FM-VSX: xssubdp [[REG:[0-9]+]], 1, 2
+; CHECK-FM-VSX: xsnegdp [[REG3:[0-9]+]], [[REG]]
+; CHECK-FM-VSX: fsel [[REG2:[0-9]+]], [[REG]], 3, 4
+; CHECK-FM-VSX: fsel 1, [[REG3]], [[REG2]], 4
+; CHECK-FM-VSX: blr
 }
 
 attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/PowerPC/fsqrt.ll b/test/CodeGen/PowerPC/fsqrt.ll
index bf8c4a2..019dfa4 100644
--- a/test/CodeGen/PowerPC/fsqrt.ll
+++ b/test/CodeGen/PowerPC/fsqrt.ll
@@ -1,13 +1,13 @@
 ; fsqrt should be generated when the fsqrt feature is enabled, but not 
 ; otherwise.
 
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mattr=+fsqrt | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mtriple=powerpc-apple-darwin8 -mattr=+fsqrt | \
 ; RUN:   grep "fsqrt f1, f1"
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mcpu=g5 | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mtriple=powerpc-apple-darwin8 -mcpu=g5 | \
 ; RUN:   grep "fsqrt f1, f1"
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mattr=-fsqrt | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mtriple=powerpc-apple-darwin8 -mattr=-fsqrt | \
 ; RUN:   not grep "fsqrt f1, f1"
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mcpu=g4 | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mtriple=powerpc-apple-darwin8 -mcpu=g4 | \
 ; RUN:   not grep "fsqrt f1, f1"
 
 declare double @llvm.sqrt.f64(double)
diff --git a/test/CodeGen/PowerPC/i64_fp.ll b/test/CodeGen/PowerPC/i64_fp.ll
index d53c948..67f4e0b 100644
--- a/test/CodeGen/PowerPC/i64_fp.ll
+++ b/test/CodeGen/PowerPC/i64_fp.ll
@@ -1,21 +1,21 @@
 ; fcfid and fctid should be generated when the 64bit feature is enabled, but not
 ; otherwise.
 
-; RUN: llc < %s -march=ppc32 -mattr=+64bit | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mattr=+64bit | \
 ; RUN:   grep fcfid
-; RUN: llc < %s -march=ppc32 -mattr=+64bit | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mattr=+64bit | \
 ; RUN:   grep fctidz
-; RUN: llc < %s -march=ppc32 -mcpu=g5 | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mcpu=g5 | \
 ; RUN:   grep fcfid
-; RUN: llc < %s -march=ppc32 -mcpu=g5 | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mcpu=g5 | \
 ; RUN:   grep fctidz
-; RUN: llc < %s -march=ppc32 -mattr=-64bit | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mattr=-64bit | \
 ; RUN:   not grep fcfid
-; RUN: llc < %s -march=ppc32 -mattr=-64bit | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mattr=-64bit | \
 ; RUN:   not grep fctidz
-; RUN: llc < %s -march=ppc32 -mcpu=g4 | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mcpu=g4 | \
 ; RUN:   not grep fcfid
-; RUN: llc < %s -march=ppc32 -mcpu=g4 | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mcpu=g4 | \
 ; RUN:   not grep fctidz
 
 define double @X(double %Y) {
diff --git a/test/CodeGen/PowerPC/mcm-10.ll b/test/CodeGen/PowerPC/mcm-10.ll
index c3ab747..9565ebc 100644
--- a/test/CodeGen/PowerPC/mcm-10.ll
+++ b/test/CodeGen/PowerPC/mcm-10.ll
@@ -22,5 +22,4 @@ entry:
 ; CHECK-NOT: extsw
 ; CHECK: stw {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
 ; CHECK: .type [[VAR]],@object
-; CHECK: .local [[VAR]]
-; CHECK: .comm [[VAR]],4,4
+; CHECK: .lcomm [[VAR]],4,4
diff --git a/test/CodeGen/PowerPC/mcm-12.ll b/test/CodeGen/PowerPC/mcm-12.ll
index b31b605..668b54f 100644
--- a/test/CodeGen/PowerPC/mcm-12.ll
+++ b/test/CodeGen/PowerPC/mcm-12.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mcpu=pwr7 -O1 -code-model=medium <%s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -O1 -code-model=medium -mattr=-vsx < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -O1 -code-model=medium -mattr=+vsx < %s | FileCheck -check-prefix=CHECK-VSX %s
 
 ; Test peephole optimization for medium code model (32-bit TOC offsets)
 ; for loading a value from the constant pool (TOC-relative).
@@ -16,3 +17,10 @@ entry:
 ; CHECK-LABEL: test_double_const:
 ; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR]]@toc@ha
 ; CHECK: lfd {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+
+; CHECK-VSX: [[VAR:[a-z0-9A-Z_.]+]]:
+; CHECK-VSX: .quad 4562098671269285104
+; CHECK-VSX-LABEL: test_double_const:
+; CHECK-VSX: addis [[REG1:[0-9]+]], 2, [[VAR]]@toc@ha
+; CHECK-VSX: addi [[REG1]], {{[0-9]+}}, [[VAR]]@toc@l
+; CHECK-VSX: lxsdx {{[0-9]+}}, 0, [[REG1]]
diff --git a/test/CodeGen/PowerPC/mcm-2.ll b/test/CodeGen/PowerPC/mcm-2.ll
index fee98d8..811600e 100644
--- a/test/CodeGen/PowerPC/mcm-2.ll
+++ b/test/CodeGen/PowerPC/mcm-2.ll
@@ -23,8 +23,7 @@ entry:
 ; MEDIUM: lwz {{[0-9]+}}, 0([[REG2]])
 ; MEDIUM: stw {{[0-9]+}}, 0([[REG2]])
 ; MEDIUM: .type [[VAR]],@object
-; MEDIUM: .local [[VAR]]
-; MEDIUM: .comm [[VAR]],4,4
+; MEDIUM: .lcomm [[VAR]],4,4
 
 ; LARGE-LABEL: test_fn_static:
 ; LARGE: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
@@ -34,6 +33,5 @@ entry:
 ; LARGE: [[VAR]]:
 ; LARGE: .tc [[VAR2:[a-z0-9A-Z_.]+]][TC],[[VAR2]]
 ; LARGE: .type [[VAR2]],@object
-; LARGE: .local [[VAR2]]
-; LARGE: .comm [[VAR2]],4,4
+; LARGE: .lcomm [[VAR2]],4,4
 
diff --git a/test/CodeGen/PowerPC/mcm-4.ll b/test/CodeGen/PowerPC/mcm-4.ll
index 73dd902..e4ceb3a 100644
--- a/test/CodeGen/PowerPC/mcm-4.ll
+++ b/test/CodeGen/PowerPC/mcm-4.ll
@@ -1,5 +1,7 @@
-; RUN: llc -mcpu=pwr7 -O0 -code-model=medium -fast-isel=false <%s | FileCheck -check-prefix=MEDIUM %s
-; RUN: llc -mcpu=pwr7 -O0 -code-model=large -fast-isel=false <%s | FileCheck -check-prefix=LARGE %s
+; RUN: llc -mcpu=pwr7 -O0 -code-model=medium -fast-isel=false -mattr=-vsx <%s | FileCheck -check-prefix=MEDIUM %s
+; RUN: llc -mcpu=pwr7 -O0 -code-model=medium -fast-isel=false -mattr=+vsx <%s | FileCheck -check-prefix=MEDIUM-VSX %s
+; RUN: llc -mcpu=pwr7 -O0 -code-model=large -fast-isel=false -mattr=-vsx <%s | FileCheck -check-prefix=LARGE %s
+; RUN: llc -mcpu=pwr7 -O0 -code-model=large -fast-isel=false -mattr=+vsx <%s | FileCheck -check-prefix=LARGE-VSX %s
 
 ; Test correct code generation for medium and large code model
 ; for loading a value from the constant pool (TOC-relative).
@@ -19,9 +21,23 @@ entry:
 ; MEDIUM: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l
 ; MEDIUM: lfd {{[0-9]+}}, 0([[REG2]])
 
+; MEDIUM-VSX: [[VAR:[a-z0-9A-Z_.]+]]:
+; MEDIUM-VSX: .quad 4562098671269285104
+; MEDIUM-VSX-LABEL: test_double_const:
+; MEDIUM-VSX: addis [[REG1:[0-9]+]], 2, [[VAR]]@toc@ha
+; MEDIUM-VSX: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l
+; MEDIUM-VSX: lxsdx {{[0-9]+}}, 0, [[REG2]]
+
 ; LARGE: [[VAR:[a-z0-9A-Z_.]+]]:
 ; LARGE: .quad 4562098671269285104
 ; LARGE-LABEL: test_double_const:
 ; LARGE: addis [[REG1:[0-9]+]], 2, [[VAR2:[a-z0-9A-Z_.]+]]@toc@ha
 ; LARGE: ld [[REG2:[0-9]+]], [[VAR2]]@toc@l([[REG1]])
 ; LARGE: lfd {{[0-9]+}}, 0([[REG2]])
+
+; LARGE-VSX: [[VAR:[a-z0-9A-Z_.]+]]:
+; LARGE-VSX: .quad 4562098671269285104
+; LARGE-VSX-LABEL: test_double_const:
+; LARGE-VSX: addis [[REG1:[0-9]+]], 2, [[VAR2:[a-z0-9A-Z_.]+]]@toc@ha
+; LARGE-VSX: ld [[REG2:[0-9]+]], [[VAR2]]@toc@l([[REG1]])
+; LARGE-VSX: lxsdx {{[0-9]+}}, 0, [[REG2]]
diff --git a/test/CodeGen/PowerPC/ppc32-lshrti3.ll b/test/CodeGen/PowerPC/ppc32-lshrti3.ll
new file mode 100644
index 0000000..6e76fea
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc32-lshrti3.ll
@@ -0,0 +1,39 @@
+; RUN: llc -O=2 < %s -mtriple=powerpc-netbsd | FileCheck %s
+
+; CHECK-NOT: bl __lshrti3
+
+; ModuleID = 'lshrti3-ppc32.c'
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc--netbsd"
+
+; Function Attrs: nounwind uwtable
+define i32 @fn1() #0 {
+entry:
+  %.promoted = load i72* inttoptr (i32 1 to i72*), align 4
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %bf.set3 = phi i72 [ %bf.set, %while.cond ], [ %.promoted, %entry ]
+  %bf.lshr = lshr i72 %bf.set3, 40
+  %bf.lshr.tr = trunc i72 %bf.lshr to i32
+  %bf.cast = and i32 %bf.lshr.tr, 65535
+  %dec = add nsw i32 %bf.lshr.tr, 65535
+  %0 = zext i32 %dec to i72
+  %bf.value = shl nuw i72 %0, 40
+  %bf.shl = and i72 %bf.value, 72056494526300160
+  %bf.clear2 = and i72 %bf.set3, -72056494526300161
+  %bf.set = or i72 %bf.shl, %bf.clear2
+  %tobool = icmp eq i32 %bf.cast, 0
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  %bf.set.lcssa = phi i72 [ %bf.set, %while.cond ]
+  store i72 %bf.set.lcssa, i72* inttoptr (i32 1 to i72*), align 4
+  ret i32 undef
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5.0 (213754)"}
diff --git a/test/CodeGen/PowerPC/ppc32-pic-large.ll b/test/CodeGen/PowerPC/ppc32-pic-large.ll
new file mode 100644
index 0000000..ecc4f10
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc32-pic-large.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -relocation-model=pic | FileCheck -check-prefix=LARGE-BSS %s
+@bar = common global i32 0, align 4
+
+define i32 @foo() {
+entry:
+  %0 = load i32* @bar, align 4
+  ret i32 %0
+}
+
+!llvm.module.flags = !{!0}
+!0 = metadata !{i32 1, metadata !"PIC Level", i32 2}
+; LARGE-BSS:       [[POFF:\.L[0-9]+\$poff]]:
+; LARGE-BSS-NEXT:    .long .LTOC-[[PB:\.L[0-9]+\$pb]]
+; LARGE-BSS-NEXT:  foo:
+; LARGE-BSS:         bl [[PB]]
+; LARGE-BSS-NEXT:  [[PB]]:
+; LARGE-BSS:         mflr 30
+; LARGE-BSS:         lwz [[REG:[0-9]+]], [[POFF]]-[[PB]](30)
+; LARGE-BSS-NEXT:    add 30, [[REG]], 30
+; LARGE-BSS:         lwz [[VREG:[0-9]+]], [[VREF:\.LC[0-9]+]]-.LTOC(30)
+; LARGE-BSS:         lwz {{[0-9]+}}, 0([[VREG]])
+; LARGE-BSS:       [[VREF]]:
+; LARGE-BSS-NEXT:    .long bar
diff --git a/test/CodeGen/PowerPC/ppc32-pic.ll b/test/CodeGen/PowerPC/ppc32-pic.ll
new file mode 100644
index 0000000..f9c3467
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc32-pic.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -relocation-model=pic | FileCheck -check-prefix=SMALL-BSS %s
+@bar = common global i32 0, align 4
+
+define i32 @foo() {
+entry:
+  %0 = load i32* @bar, align 4
+  ret i32 %0
+}
+
+!llvm.module.flags = !{!0}
+!0 = metadata !{i32 1, metadata !"PIC Level", i32 1}
+; SMALL-BSS-LABEL:foo:
+; SMALL-BSS:         bl _GLOBAL_OFFSET_TABLE_@local-4
+; SMALL-BSS:         mflr 30
+; SMALL-BSS:         lwz [[VREG:[0-9]+]], bar@GOT(30)
+; SMALL-BSS:         lwz {{[0-9]+}}, 0([[VREG]])
diff --git a/test/CodeGen/PowerPC/ppc440-msync.ll b/test/CodeGen/PowerPC/ppc440-msync.ll
index 1274173..3f4e7fd 100644
--- a/test/CodeGen/PowerPC/ppc440-msync.ll
+++ b/test/CodeGen/PowerPC/ppc440-msync.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=ppc32 | FileCheck %s
+; RUN: llc < %s -march=ppc64 -mcpu=a2 | FileCheck %s
 ; RUN: llc < %s -march=ppc32 -mcpu=440 | FileCheck %s -check-prefix=BE-CHK
 
 define i32 @has_a_fence(i32 %a, i32 %b) nounwind {
diff --git a/test/CodeGen/PowerPC/ppc64-align-long-double.ll b/test/CodeGen/PowerPC/ppc64-align-long-double.ll
index 764d3ce..5ed029c 100644
--- a/test/CodeGen/PowerPC/ppc64-align-long-double.ll
+++ b/test/CodeGen/PowerPC/ppc64-align-long-double.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mcpu=pwr7 -O0 -fast-isel=false < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -O0 -fast-isel=false -mattr=-vsx < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -O0 -fast-isel=false -mattr=+vsx < %s | FileCheck -check-prefix=CHECK-VSX %s
 
 ; Verify internal alignment of long double in a struct.  The double
 ; argument comes in in GPR3; GPR4 is skipped; GPRs 5 and 6 contain
@@ -24,3 +25,12 @@ entry:
 ; CHECK: lfd 1, 64(1)
 ; CHECK: lfd 2, 72(1)
 
+; CHECK-VSX: std 6, 72(1)
+; CHECK-VSX: std 5, 64(1)
+; CHECK-VSX: std 4, 56(1)
+; CHECK-VSX: std 3, 48(1)
+; CHECK-VSX: li 3, 16
+; CHECK-VSX: addi 4, 1, 48
+; CHECK-VSX: lxsdx 1, 4, 3
+; CHECK-VSX: li 3, 24
+; CHECK-VSX: lxsdx 2, 4, 3
diff --git a/test/CodeGen/PowerPC/ppc64-elf-abi.ll b/test/CodeGen/PowerPC/ppc64-elf-abi.ll
new file mode 100644
index 0000000..d82122d
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-elf-abi.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefix=CHECK-ELFv1
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mattr=+elfv1 < %s | FileCheck %s -check-prefix=CHECK-ELFv1
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mattr=+elfv2 < %s | FileCheck %s -check-prefix=CHECK-ELFv2
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefix=CHECK-ELFv2
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mattr=+elfv1 < %s | FileCheck %s -check-prefix=CHECK-ELFv1
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mattr=+elfv2 < %s | FileCheck %s -check-prefix=CHECK-ELFv2
+
+; CHECK-ELFv2: .abiversion 2
+; CHECK-ELFv1-NOT: .abiversion 2
+
diff --git a/test/CodeGen/PowerPC/ppc64-prefetch.ll b/test/CodeGen/PowerPC/ppc64-prefetch.ll
index b2f3709..b2f6e7d 100644
--- a/test/CodeGen/PowerPC/ppc64-prefetch.ll
+++ b/test/CodeGen/PowerPC/ppc64-prefetch.ll
@@ -1,15 +1,34 @@
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -mcpu=a2 < %s | FileCheck %s
 
 define void @test1(i8* %a, ...) nounwind {
 entry:
   call void @llvm.prefetch(i8* %a, i32 0, i32 3, i32 1)
   ret void
+
+; CHECK-LABEL: @test1
+; CHECK: dcbt
 }
 
 declare void @llvm.prefetch(i8*, i32, i32, i32)
 
-; CHECK: @test1
-; CHECK: dcbt
+define void @test2(i8* %a, ...) nounwind {
+entry:
+  call void @llvm.prefetch(i8* %a, i32 1, i32 3, i32 1)
+  ret void
+
+; CHECK-LABEL: @test2
+; CHECK: dcbtst
+}
+
+define void @test3(i8* %a, ...) nounwind {
+entry:
+  call void @llvm.prefetch(i8* %a, i32 0, i32 3, i32 0)
+  ret void
+
+; CHECK-LABEL: @test3
+; CHECK: icbt
+}
+
 
diff --git a/test/CodeGen/PowerPC/ppc64le-aggregates.ll b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
new file mode 100644
index 0000000..9eed623
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
@@ -0,0 +1,329 @@
+; RUN: llc < %s -march=ppc64le -mcpu=pwr8 -mattr=+altivec | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+;
+; Verify use of registers for float/vector aggregate return.
+;
+
+define [8 x float] @return_float([8 x float] %x) {
+entry:
+  ret [8 x float] %x
+}
+; CHECK-LABEL: @return_float
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+define [8 x double] @return_double([8 x double] %x) {
+entry:
+  ret [8 x double] %x
+}
+; CHECK-LABEL: @return_double
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+define [4 x ppc_fp128] @return_ppcf128([4 x ppc_fp128] %x) {
+entry:
+  ret [4 x ppc_fp128] %x
+}
+; CHECK-LABEL: @return_ppcf128
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+define [8 x <4 x i32>] @return_v4i32([8 x <4 x i32>] %x) {
+entry:
+  ret [8 x <4 x i32>] %x
+}
+; CHECK-LABEL: @return_v4i32
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+
+;
+; Verify amount of space taken up by aggregates in the parameter save area.
+;
+
+define i64 @callee_float([7 x float] %a, [7 x float] %b, i64 %c) {
+entry:
+  ret i64 %c
+}
+; CHECK-LABEL: @callee_float
+; CHECK: ld 3, 96(1)
+; CHECK: blr
+
+define void @caller_float(i64 %x, [7 x float] %y) {
+entry:
+  tail call void @test_float([7 x float] %y, [7 x float] %y, i64 %x)
+  ret void
+}
+; CHECK-LABEL: @caller_float
+; CHECK: std 3, 96(1)
+; CHECK: bl test_float
+
+declare void @test_float([7 x float], [7 x float], i64)
+
+define i64 @callee_double(i64 %a, [7 x double] %b, i64 %c) {
+entry:
+  ret i64 %c
+}
+; CHECK-LABEL: @callee_double
+; CHECK: ld 3, 96(1)
+; CHECK: blr
+
+define void @caller_double(i64 %x, [7 x double] %y) {
+entry:
+  tail call void @test_double(i64 %x, [7 x double] %y, i64 %x)
+  ret void
+}
+; CHECK-LABEL: @caller_double
+; CHECK: std 3, 96(1)
+; CHECK: bl test_double
+
+declare void @test_double(i64, [7 x double], i64)
+
+define i64 @callee_ppcf128(i64 %a, [4 x ppc_fp128] %b, i64 %c) {
+entry:
+  ret i64 %c
+}
+; CHECK-LABEL: @callee_ppcf128
+; CHECK: ld 3, 104(1)
+; CHECK: blr
+
+define void @caller_ppcf128(i64 %x, [4 x ppc_fp128] %y) {
+entry:
+  tail call void @test_ppcf128(i64 %x, [4 x ppc_fp128] %y, i64 %x)
+  ret void
+}
+; CHECK-LABEL: @caller_ppcf128
+; CHECK: std 3, 104(1)
+; CHECK: bl test_ppcf128
+
+declare void @test_ppcf128(i64, [4 x ppc_fp128], i64)
+
+define i64 @callee_i64(i64 %a, [7 x i64] %b, i64 %c) {
+entry:
+  ret i64 %c
+}
+; CHECK-LABEL: @callee_i64
+; CHECK: ld 3, 96(1)
+; CHECK: blr
+
+define void @caller_i64(i64 %x, [7 x i64] %y) {
+entry:
+  tail call void @test_i64(i64 %x, [7 x i64] %y, i64 %x)
+  ret void
+}
+; CHECK-LABEL: @caller_i64
+; CHECK: std 3, 96(1)
+; CHECK: bl test_i64
+
+declare void @test_i64(i64, [7 x i64], i64)
+
+define i64 @callee_i128(i64 %a, [4 x i128] %b, i64 %c) {
+entry:
+  ret i64 %c
+}
+; CHECK-LABEL: @callee_i128
+; CHECK: ld 3, 112(1)
+; CHECK: blr
+
+define void @caller_i128(i64 %x, [4 x i128] %y) {
+entry:
+  tail call void @test_i128(i64 %x, [4 x i128] %y, i64 %x)
+  ret void
+}
+; CHECK-LABEL: @caller_i128
+; CHECK: std 3, 112(1)
+; CHECK: bl test_i128
+
+declare void @test_i128(i64, [4 x i128], i64)
+
+define i64 @callee_v4i32(i64 %a, [4 x <4 x i32>] %b, i64 %c) {
+entry:
+  ret i64 %c
+}
+; CHECK-LABEL: @callee_v4i32
+; CHECK: ld 3, 112(1)
+; CHECK: blr
+
+define void @caller_v4i32(i64 %x, [4 x <4 x i32>] %y) {
+entry:
+  tail call void @test_v4i32(i64 %x, [4 x <4 x i32>] %y, i64 %x)
+  ret void
+}
+; CHECK-LABEL: @caller_v4i32
+; CHECK: std 3, 112(1)
+; CHECK: bl test_v4i32
+
+declare void @test_v4i32(i64, [4 x <4 x i32>], i64)
+
+
+;
+; Verify handling of floating point arguments in GPRs
+;
+
+%struct.float8 = type { [8 x float] }
+%struct.float5 = type { [5 x float] }
+%struct.float2 = type { [2 x float] }
+
+@g8 = common global %struct.float8 zeroinitializer, align 4
+@g5 = common global %struct.float5 zeroinitializer, align 4
+@g2 = common global %struct.float2 zeroinitializer, align 4
+
+define float @callee0([7 x float] %a, [7 x float] %b) {
+entry:
+  %b.extract = extractvalue [7 x float] %b, 6
+  ret float %b.extract
+}
+; CHECK-LABEL: @callee0
+; CHECK: stw 10, [[OFF:.*]](1)
+; CHECK: lfs 1, [[OFF]](1)
+; CHECK: blr
+
+define void @caller0([7 x float] %a) {
+entry:
+  tail call void @test0([7 x float] %a, [7 x float] %a)
+  ret void
+}
+; CHECK-LABEL: @caller0
+; CHECK-DAG: fmr 8, 1
+; CHECK-DAG: fmr 9, 2
+; CHECK-DAG: fmr 10, 3
+; CHECK-DAG: fmr 11, 4
+; CHECK-DAG: fmr 12, 5
+; CHECK-DAG: fmr 13, 6
+; CHECK-DAG: stfs 7, [[OFF:[0-9]+]](1)
+; CHECK-DAG: lwz 10, [[OFF]](1)
+; CHECK: bl test0
+
+declare void @test0([7 x float], [7 x float])
+
+define float @callee1([8 x float] %a, [8 x float] %b) {
+entry:
+  %b.extract = extractvalue [8 x float] %b, 7
+  ret float %b.extract
+}
+; CHECK-LABEL: @callee1
+; CHECK: rldicl [[REG:[0-9]+]], 10, 32, 32
+; CHECK: stw [[REG]], [[OFF:.*]](1)
+; CHECK: lfs 1, [[OFF]](1)
+; CHECK: blr
+
+define void @caller1([8 x float] %a) {
+entry:
+  tail call void @test1([8 x float] %a, [8 x float] %a)
+  ret void
+}
+; CHECK-LABEL: @caller1
+; CHECK-DAG: fmr 9, 1
+; CHECK-DAG: fmr 10, 2
+; CHECK-DAG: fmr 11, 3
+; CHECK-DAG: fmr 12, 4
+; CHECK-DAG: fmr 13, 5
+; CHECK-DAG: stfs 5, [[OFF0:[0-9]+]](1)
+; CHECK-DAG: stfs 6, [[OFF1:[0-9]+]](1)
+; CHECK-DAG: stfs 7, [[OFF2:[0-9]+]](1)
+; CHECK-DAG: stfs 8, [[OFF3:[0-9]+]](1)
+; CHECK-DAG: lwz [[REG0:[0-9]+]], [[OFF0]](1)
+; CHECK-DAG: lwz [[REG1:[0-9]+]], [[OFF1]](1)
+; CHECK-DAG: lwz [[REG2:[0-9]+]], [[OFF2]](1)
+; CHECK-DAG: lwz [[REG3:[0-9]+]], [[OFF3]](1)
+; CHECK-DAG: sldi [[REG1]], [[REG1]], 32
+; CHECK-DAG: sldi [[REG3]], [[REG3]], 32
+; CHECK-DAG: or 9, [[REG0]], [[REG1]]
+; CHECK-DAG: or 10, [[REG2]], [[REG3]]
+; CHECK: bl test1
+
+declare void @test1([8 x float], [8 x float])
+
+define float @callee2([8 x float] %a, [5 x float] %b, [2 x float] %c) {
+entry:
+  %c.extract = extractvalue [2 x float] %c, 1
+  ret float %c.extract
+}
+; CHECK-LABEL: @callee2
+; CHECK: rldicl [[REG:[0-9]+]], 10, 32, 32
+; CHECK: stw [[REG]], [[OFF:.*]](1)
+; CHECK: lfs 1, [[OFF]](1)
+; CHECK: blr
+
+define void @caller2() {
+entry:
+  %0 = load [8 x float]* getelementptr inbounds (%struct.float8* @g8, i64 0, i32 0), align 4
+  %1 = load [5 x float]* getelementptr inbounds (%struct.float5* @g5, i64 0, i32 0), align 4
+  %2 = load [2 x float]* getelementptr inbounds (%struct.float2* @g2, i64 0, i32 0), align 4
+  tail call void @test2([8 x float] %0, [5 x float] %1, [2 x float] %2)
+  ret void
+}
+; CHECK-LABEL: @caller2
+; CHECK: ld [[REG:[0-9]+]], .LC
+; CHECK-DAG: lfs 1, 0([[REG]])
+; CHECK-DAG: lfs 2, 4([[REG]])
+; CHECK-DAG: lfs 3, 8([[REG]])
+; CHECK-DAG: lfs 4, 12([[REG]])
+; CHECK-DAG: lfs 5, 16([[REG]])
+; CHECK-DAG: lfs 6, 20([[REG]])
+; CHECK-DAG: lfs 7, 24([[REG]])
+; CHECK-DAG: lfs 8, 28([[REG]])
+; CHECK: ld [[REG:[0-9]+]], .LC
+; CHECK-DAG: lfs 9, 0([[REG]])
+; CHECK-DAG: lfs 10, 4([[REG]])
+; CHECK-DAG: lfs 11, 8([[REG]])
+; CHECK-DAG: lfs 12, 12([[REG]])
+; CHECK-DAG: lfs 13, 16([[REG]])
+; CHECK: ld [[REG:[0-9]+]], .LC
+; CHECK-DAG: lwz [[REG0:[0-9]+]], 0([[REG]])
+; CHECK-DAG: lwz [[REG1:[0-9]+]], 4([[REG]])
+; CHECK-DAG: sldi [[REG1]], [[REG1]], 32
+; CHECK-DAG: or 10, [[REG0]], [[REG1]]
+; CHECK: bl test2
+
+declare void @test2([8 x float], [5 x float], [2 x float])
+
+define double @callee3([8 x float] %a, [5 x float] %b, double %c) {
+entry:
+  ret double %c
+}
+; CHECK-LABEL: @callee3
+; CHECK: std 10, [[OFF:.*]](1)
+; CHECK: lfd 1, [[OFF]](1)
+; CHECK: blr
+
+define void @caller3(double %d) {
+entry:
+  %0 = load [8 x float]* getelementptr inbounds (%struct.float8* @g8, i64 0, i32 0), align 4
+  %1 = load [5 x float]* getelementptr inbounds (%struct.float5* @g5, i64 0, i32 0), align 4
+  tail call void @test3([8 x float] %0, [5 x float] %1, double %d)
+  ret void
+}
+; CHECK-LABEL: @caller3
+; CHECK: stfd 1, [[OFF:.*]](1)
+; CHECK: ld 10, [[OFF]](1)
+; CHECK: bl test3
+
+declare void @test3([8 x float], [5 x float], double)
+
+define float @callee4([8 x float] %a, [5 x float] %b, float %c) {
+entry:
+  ret float %c
+}
+; CHECK-LABEL: @callee4
+; CHECK: stw 10, [[OFF:.*]](1)
+; CHECK: lfs 1, [[OFF]](1)
+; CHECK: blr
+
+define void @caller4(float %f) {
+entry:
+  %0 = load [8 x float]* getelementptr inbounds (%struct.float8* @g8, i64 0, i32 0), align 4
+  %1 = load [5 x float]* getelementptr inbounds (%struct.float5* @g5, i64 0, i32 0), align 4
+  tail call void @test4([8 x float] %0, [5 x float] %1, float %f)
+  ret void
+}
+; CHECK-LABEL: @caller4
+; CHECK: stfs 1, [[OFF:.*]](1)
+; CHECK: lwz 10, [[OFF]](1)
+; CHECK: bl test4
+
+declare void @test4([8 x float], [5 x float], float)
+
diff --git a/test/CodeGen/PowerPC/ppc64le-calls.ll b/test/CodeGen/PowerPC/ppc64le-calls.ll
new file mode 100644
index 0000000..0d667dd
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64le-calls.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=ppc64le -mcpu=pwr8 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Indirect calls requires a full stub creation
+define void @test_indirect(void ()* nocapture %fp) {
+; CHECK-LABEL: @test_indirect
+  tail call void %fp()
+; CHECK-DAG: std 2, 24(1)
+; CHECK-DAG: mr 12, 3
+; CHECK-DAG: mtctr 3
+; CHECK: bctrl
+; CHECK-NEXT: ld 2, 24(1)
+  ret void
+}
+
diff --git a/test/CodeGen/PowerPC/ppc64le-crsave.ll b/test/CodeGen/PowerPC/ppc64le-crsave.ll
new file mode 100644
index 0000000..17174d7
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64le-crsave.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+@_ZTIi = external constant i8*
+declare i8* @__cxa_allocate_exception(i64)
+declare void @__cxa_throw(i8*, i8*, i8*)
+
+define void @crsave() {
+entry:
+  call void asm sideeffect "", "~{cr2}"()
+  call void asm sideeffect "", "~{cr3}"()
+  call void asm sideeffect "", "~{cr4}"()
+
+  %exception = call i8* @__cxa_allocate_exception(i64 4)
+  %0 = bitcast i8* %exception to i32*
+  store i32 0, i32* %0
+  call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+  unreachable
+
+return:                                           ; No predecessors!
+  ret void
+}
+; CHECK-LABEL: @crsave
+; CHECK: .cfi_offset cr2, 8
+; CHECK: .cfi_offset cr3, 8
+; CHECK: .cfi_offset cr4, 8
+
diff --git a/test/CodeGen/PowerPC/ppc64le-localentry.ll b/test/CodeGen/PowerPC/ppc64le-localentry.ll
new file mode 100644
index 0000000..4676ce8
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64le-localentry.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=ppc64le -mcpu=pwr8 < %s | FileCheck %s
+; RUN: llc -march=ppc64le -mcpu=pwr8 -O0 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+@number64 = global i64 10, align 8
+
+; CHECK: .abiversion 2
+
+define i64 @use_toc(i64 %a) nounwind {
+entry:
+; CHECK-LABEL: @use_toc
+; CHECK-NEXT: .Ltmp[[TMP1:[0-9]+]]:
+; CHECK-NEXT: addis 2, 12, .TOC.-.Ltmp[[TMP1]]@ha
+; CHECK-NEXT: addi 2, 2, .TOC.-.Ltmp[[TMP1]]@l
+; CHECK-NEXT: .Ltmp[[TMP2:[0-9]+]]:
+; CHECK-NEXT: .localentry use_toc, .Ltmp[[TMP2]]-.Ltmp[[TMP1]]
+; CHECK-NEXT: %entry
+  %0 = load i64* @number64, align 8
+  %cmp = icmp eq i64 %0, %a
+  %conv1 = zext i1 %cmp to i64
+  ret i64 %conv1
+}
+
+declare void @callee()
+define void @use_toc_implicit() nounwind {
+entry:
+; CHECK-LABEL: @use_toc_implicit
+; CHECK-NEXT: .Ltmp[[TMP1:[0-9]+]]:
+; CHECK-NEXT: addis 2, 12, .TOC.-.Ltmp[[TMP1]]@ha
+; CHECK-NEXT: addi 2, 2, .TOC.-.Ltmp[[TMP1]]@l
+; CHECK-NEXT: .Ltmp[[TMP2:[0-9]+]]:
+; CHECK-NEXT: .localentry use_toc_implicit, .Ltmp[[TMP2]]-.Ltmp[[TMP1]]
+; CHECK-NEXT: %entry
+  call void @callee()
+  ret void
+}
+
+define i64 @no_toc(i64 %a) nounwind {
+entry:
+; CHECK-LABEL: @no_toc
+; CHECK-NEXT: %entry
+  ret i64 %a
+}
+
diff --git a/test/CodeGen/PowerPC/ppc64le-smallarg.ll b/test/CodeGen/PowerPC/ppc64le-smallarg.ll
index fcb1e92..120c140 100644
--- a/test/CodeGen/PowerPC/ppc64le-smallarg.ll
+++ b/test/CodeGen/PowerPC/ppc64le-smallarg.ll
@@ -22,7 +22,7 @@ entry:
   ret void
 }
 ; CHECK: @callee1
-; CHECK: lwz {{[0-9]+}}, 120(1)
+; CHECK: lwz {{[0-9]+}}, 104(1)
 ; CHECK: blr
 
 define void @caller1() {
@@ -32,7 +32,7 @@ entry:
   ret void
 }
 ; CHECK: @caller1
-; CHECK: stw {{[0-9]+}}, 120(1)
+; CHECK: stw {{[0-9]+}}, 104(1)
 ; CHECK: bl test1
 
 declare void @test1(%struct.small_arg* sret, %struct.large_arg* byval, %struct.small_arg* byval)
@@ -42,7 +42,7 @@ entry:
   ret float %x
 }
 ; CHECK: @callee2
-; CHECK: lfs {{[0-9]+}}, 152(1)
+; CHECK: lfs {{[0-9]+}}, 136(1)
 ; CHECK: blr
 
 define void @caller2() {
@@ -52,7 +52,7 @@ entry:
   ret void
 }
 ; CHECK: @caller2
-; CHECK: stfs {{[0-9]+}}, 152(1)
+; CHECK: stfs {{[0-9]+}}, 136(1)
 ; CHECK: bl test2
 
 declare float @test2(float, float, float, float, float, float, float, float, float, float, float, float, float, float)
diff --git a/test/CodeGen/PowerPC/ppcf128-1.ll b/test/CodeGen/PowerPC/ppcf128-1.ll
index 1047fe5..2cec934 100644
--- a/test/CodeGen/PowerPC/ppcf128-1.ll
+++ b/test/CodeGen/PowerPC/ppcf128-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -std-compile-opts | llc > %t
+; RUN: opt < %s -O3 | llc > %t
 ; ModuleID = 'ld3.c'
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
 target triple = "powerpc-apple-darwin8"
diff --git a/test/CodeGen/PowerPC/pr15630.ll b/test/CodeGen/PowerPC/pr15630.ll
index c5ba8a4..3c1b604 100644
--- a/test/CodeGen/PowerPC/pr15630.ll
+++ b/test/CodeGen/PowerPC/pr15630.ll
@@ -13,4 +13,5 @@ entry:
   ret void
 }
 
-; CHECK: stwcx.
+; CHECK: sync
+; CHECK: stb
diff --git a/test/CodeGen/PowerPC/pr17168.ll b/test/CodeGen/PowerPC/pr17168.ll
index 24bcda0..c3f0162 100644
--- a/test/CodeGen/PowerPC/pr17168.ll
+++ b/test/CodeGen/PowerPC/pr17168.ll
@@ -25,7 +25,7 @@ for.cond968.preheader:                            ; preds = %for.cond968.prehead
 for.end1042:                                      ; preds = %for.cond968.preheader, %for.cond964.preheader, %entry
   %0 = phi i32 [ undef, %for.cond964.preheader ], [ undef, %for.cond968.preheader ], [ undef, %entry ]
   %1 = load i32* getelementptr inbounds ([3 x i32]* @grid_points, i64 0, i64 0), align 4, !dbg !443, !tbaa !444
-  tail call void @llvm.dbg.value(metadata !447, i64 0, metadata !119), !dbg !448
+  tail call void @llvm.dbg.value(metadata !447, i64 0, metadata !119, metadata !{metadata !"0x102"}), !dbg !448
   %sub10454270 = add nsw i32 %0, -1, !dbg !448
   %cmp10464271 = icmp sgt i32 %sub10454270, 1, !dbg !448
   %sub11134263 = add nsw i32 %1, -1, !dbg !450
@@ -46,7 +46,7 @@ for.cond1816.preheader.for.inc1898_crit_edge:     ; preds = %for.cond1816.prehea
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -54,468 +54,468 @@ attributes #1 = { nounwind readnone }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!438, !464}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 190311)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !298, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 190311)\001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !298, metadata !2} ; [ DW_TAG_compile_unit ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"bt.c", metadata !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !82, metadata !102, metadata !114, metadata !132, metadata !145, metadata !154, metadata !155, metadata !162, metadata !183, metadata !200, metadata !201, metadata !207, metadata !208, metadata !215, metadata !221, metadata !230, metadata !238, metadata !246, metadata !255, metadata !260, metadata !261, metadata !268, metadata !274, metadata !279, metadata !280, metadata !287, metadata !293}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 74, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !12, i32 74} ; [ DW_TAG_subprogram ] [line 74] [def] [main]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00main\00main\00\0074\000\001\000\006\00256\001\0074", metadata !1, metadata !5, metadata !6, null, null, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 74] [def] [main]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8, metadata !9}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
-!11 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_unsigned_char]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!11 = metadata !{metadata !"0x24\00char\000\008\008\000\000\008", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_unsigned_char]
 !12 = metadata !{metadata !13, metadata !14, metadata !15, metadata !16, metadata !17, metadata !18, metadata !19, metadata !21, metadata !22, metadata !23, metadata !25, metadata !26}
-!13 = metadata !{i32 786689, metadata !4, metadata !"argc", metadata !5, i32 16777290, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argc] [line 74]
-!14 = metadata !{i32 786689, metadata !4, metadata !"argv", metadata !5, i32 33554506, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argv] [line 74]
-!15 = metadata !{i32 786688, metadata !4, metadata !"niter", metadata !5, i32 76, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [niter] [line 76]
-!16 = metadata !{i32 786688, metadata !4, metadata !"step", metadata !5, i32 76, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [step] [line 76]
-!17 = metadata !{i32 786688, metadata !4, metadata !"n3", metadata !5, i32 76, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n3] [line 76]
-!18 = metadata !{i32 786688, metadata !4, metadata !"nthreads", metadata !5, i32 77, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [nthreads] [line 77]
-!19 = metadata !{i32 786688, metadata !4, metadata !"navg", metadata !5, i32 78, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [navg] [line 78]
-!20 = metadata !{i32 786468, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
-!21 = metadata !{i32 786688, metadata !4, metadata !"mflops", metadata !5, i32 78, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [mflops] [line 78]
-!22 = metadata !{i32 786688, metadata !4, metadata !"tmax", metadata !5, i32 80, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [tmax] [line 80]
-!23 = metadata !{i32 786688, metadata !4, metadata !"verified", metadata !5, i32 81, metadata !24, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [verified] [line 81]
-!24 = metadata !{i32 786454, metadata !1, null, metadata !"boolean", i32 12, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ] [boolean] [line 12, size 0, align 0, offset 0] [from int]
-!25 = metadata !{i32 786688, metadata !4, metadata !"class", metadata !5, i32 82, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [class] [line 82]
-!26 = metadata !{i32 786688, metadata !4, metadata !"fp", metadata !5, i32 83, metadata !27, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [fp] [line 83]
-!27 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from FILE]
-!28 = metadata !{i32 786454, metadata !1, null, metadata !"FILE", i32 49, i64 0, i64 0, i64 0, i32 0, metadata !29} ; [ DW_TAG_typedef ] [FILE] [line 49, size 0, align 0, offset 0] [from _IO_FILE]
-!29 = metadata !{i32 786451, metadata !30, null, metadata !"_IO_FILE", i32 271, i64 1728, i64 64, i32 0, i32 0, null, metadata !31, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [_IO_FILE] [line 271, size 1728, align 64, offset 0] [def] [from ]
+!13 = metadata !{metadata !"0x101\00argc\0016777290\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [argc] [line 74]
+!14 = metadata !{metadata !"0x101\00argv\0033554506\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [argv] [line 74]
+!15 = metadata !{metadata !"0x100\00niter\0076\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [niter] [line 76]
+!16 = metadata !{metadata !"0x100\00step\0076\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [step] [line 76]
+!17 = metadata !{metadata !"0x100\00n3\0076\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [n3] [line 76]
+!18 = metadata !{metadata !"0x100\00nthreads\0077\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [nthreads] [line 77]
+!19 = metadata !{metadata !"0x100\00navg\0078\000", metadata !4, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [navg] [line 78]
+!20 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!21 = metadata !{metadata !"0x100\00mflops\0078\000", metadata !4, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [mflops] [line 78]
+!22 = metadata !{metadata !"0x100\00tmax\0080\000", metadata !4, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [tmax] [line 80]
+!23 = metadata !{metadata !"0x100\00verified\0081\000", metadata !4, metadata !5, metadata !24} ; [ DW_TAG_auto_variable ] [verified] [line 81]
+!24 = metadata !{metadata !"0x16\00boolean\0012\000\000\000\000", metadata !1, null, metadata !8} ; [ DW_TAG_typedef ] [boolean] [line 12, size 0, align 0, offset 0] [from int]
+!25 = metadata !{metadata !"0x100\00class\0082\000", metadata !4, metadata !5, metadata !11} ; [ DW_TAG_auto_variable ] [class] [line 82]
+!26 = metadata !{metadata !"0x100\00fp\0083\000", metadata !4, metadata !5, metadata !27} ; [ DW_TAG_auto_variable ] [fp] [line 83]
+!27 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from FILE]
+!28 = metadata !{metadata !"0x16\00FILE\0049\000\000\000\000", metadata !1, null, metadata !29} ; [ DW_TAG_typedef ] [FILE] [line 49, size 0, align 0, offset 0] [from _IO_FILE]
+!29 = metadata !{metadata !"0x13\00_IO_FILE\00271\001728\0064\000\000\000", metadata !30, null, null, metadata !31, null, null, null} ; [ DW_TAG_structure_type ] [_IO_FILE] [line 271, size 1728, align 64, offset 0] [def] [from ]
 !30 = metadata !{metadata !"/usr/include/libio.h", metadata !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
 !31 = metadata !{metadata !32, metadata !33, metadata !34, metadata !35, metadata !36, metadata !37, metadata !38, metadata !39, metadata !40, metadata !41, metadata !42, metadata !43, metadata !44, metadata !52, metadata !53, metadata !54, metadata !55, metadata !58, metadata !60, metadata !62, metadata !66, metadata !68, metadata !70, metadata !71, metadata !72, metadata !73, metadata !74, metadata !77, metadata !78}
-!32 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_flags", i32 272, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [_flags] [line 272, size 32, align 32, offset 0] [from int]
-!33 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_read_ptr", i32 277, i64 64, i64 64, i64 64, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_read_ptr] [line 277, size 64, align 64, offset 64] [from ]
-!34 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_read_end", i32 278, i64 64, i64 64, i64 128, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_read_end] [line 278, size 64, align 64, offset 128] [from ]
-!35 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_read_base", i32 279, i64 64, i64 64, i64 192, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_read_base] [line 279, size 64, align 64, offset 192] [from ]
-!36 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_write_base", i32 280, i64 64, i64 64, i64 256, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_write_base] [line 280, size 64, align 64, offset 256] [from ]
-!37 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_write_ptr", i32 281, i64 64, i64 64, i64 320, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_write_ptr] [line 281, size 64, align 64, offset 320] [from ]
-!38 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_write_end", i32 282, i64 64, i64 64, i64 384, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_write_end] [line 282, size 64, align 64, offset 384] [from ]
-!39 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_buf_base", i32 283, i64 64, i64 64, i64 448, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_buf_base] [line 283, size 64, align 64, offset 448] [from ]
-!40 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_buf_end", i32 284, i64 64, i64 64, i64 512, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_buf_end] [line 284, size 64, align 64, offset 512] [from ]
-!41 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_save_base", i32 286, i64 64, i64 64, i64 576, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_save_base] [line 286, size 64, align 64, offset 576] [from ]
-!42 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_backup_base", i32 287, i64 64, i64 64, i64 640, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_backup_base] [line 287, size 64, align 64, offset 640] [from ]
-!43 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_save_end", i32 288, i64 64, i64 64, i64 704, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_save_end] [line 288, size 64, align 64, offset 704] [from ]
-!44 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_markers", i32 290, i64 64, i64 64, i64 768, i32 0, metadata !45} ; [ DW_TAG_member ] [_markers] [line 290, size 64, align 64, offset 768] [from ]
-!45 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !46} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _IO_marker]
-!46 = metadata !{i32 786451, metadata !30, null, metadata !"_IO_marker", i32 186, i64 192, i64 64, i32 0, i32 0, null, metadata !47, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [_IO_marker] [line 186, size 192, align 64, offset 0] [def] [from ]
+!32 = metadata !{metadata !"0xd\00_flags\00272\0032\0032\000\000", metadata !30, metadata !29, metadata !8} ; [ DW_TAG_member ] [_flags] [line 272, size 32, align 32, offset 0] [from int]
+!33 = metadata !{metadata !"0xd\00_IO_read_ptr\00277\0064\0064\0064\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_read_ptr] [line 277, size 64, align 64, offset 64] [from ]
+!34 = metadata !{metadata !"0xd\00_IO_read_end\00278\0064\0064\00128\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_read_end] [line 278, size 64, align 64, offset 128] [from ]
+!35 = metadata !{metadata !"0xd\00_IO_read_base\00279\0064\0064\00192\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_read_base] [line 279, size 64, align 64, offset 192] [from ]
+!36 = metadata !{metadata !"0xd\00_IO_write_base\00280\0064\0064\00256\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_write_base] [line 280, size 64, align 64, offset 256] [from ]
+!37 = metadata !{metadata !"0xd\00_IO_write_ptr\00281\0064\0064\00320\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_write_ptr] [line 281, size 64, align 64, offset 320] [from ]
+!38 = metadata !{metadata !"0xd\00_IO_write_end\00282\0064\0064\00384\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_write_end] [line 282, size 64, align 64, offset 384] [from ]
+!39 = metadata !{metadata !"0xd\00_IO_buf_base\00283\0064\0064\00448\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_buf_base] [line 283, size 64, align 64, offset 448] [from ]
+!40 = metadata !{metadata !"0xd\00_IO_buf_end\00284\0064\0064\00512\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_buf_end] [line 284, size 64, align 64, offset 512] [from ]
+!41 = metadata !{metadata !"0xd\00_IO_save_base\00286\0064\0064\00576\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_save_base] [line 286, size 64, align 64, offset 576] [from ]
+!42 = metadata !{metadata !"0xd\00_IO_backup_base\00287\0064\0064\00640\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_backup_base] [line 287, size 64, align 64, offset 640] [from ]
+!43 = metadata !{metadata !"0xd\00_IO_save_end\00288\0064\0064\00704\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_save_end] [line 288, size 64, align 64, offset 704] [from ]
+!44 = metadata !{metadata !"0xd\00_markers\00290\0064\0064\00768\000", metadata !30, metadata !29, metadata !45} ; [ DW_TAG_member ] [_markers] [line 290, size 64, align 64, offset 768] [from ]
+!45 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !46} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _IO_marker]
+!46 = metadata !{metadata !"0x13\00_IO_marker\00186\00192\0064\000\000\000", metadata !30, null, null, metadata !47, null, null, null} ; [ DW_TAG_structure_type ] [_IO_marker] [line 186, size 192, align 64, offset 0] [def] [from ]
 !47 = metadata !{metadata !48, metadata !49, metadata !51}
-!48 = metadata !{i32 786445, metadata !30, metadata !46, metadata !"_next", i32 187, i64 64, i64 64, i64 0, i32 0, metadata !45} ; [ DW_TAG_member ] [_next] [line 187, size 64, align 64, offset 0] [from ]
-!49 = metadata !{i32 786445, metadata !30, metadata !46, metadata !"_sbuf", i32 188, i64 64, i64 64, i64 64, i32 0, metadata !50} ; [ DW_TAG_member ] [_sbuf] [line 188, size 64, align 64, offset 64] [from ]
-!50 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !29} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _IO_FILE]
-!51 = metadata !{i32 786445, metadata !30, metadata !46, metadata !"_pos", i32 192, i64 32, i64 32, i64 128, i32 0, metadata !8} ; [ DW_TAG_member ] [_pos] [line 192, size 32, align 32, offset 128] [from int]
-!52 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_chain", i32 292, i64 64, i64 64, i64 832, i32 0, metadata !50} ; [ DW_TAG_member ] [_chain] [line 292, size 64, align 64, offset 832] [from ]
-!53 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_fileno", i32 294, i64 32, i64 32, i64 896, i32 0, metadata !8} ; [ DW_TAG_member ] [_fileno] [line 294, size 32, align 32, offset 896] [from int]
-!54 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_flags2", i32 298, i64 32, i64 32, i64 928, i32 0, metadata !8} ; [ DW_TAG_member ] [_flags2] [line 298, size 32, align 32, offset 928] [from int]
-!55 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_old_offset", i32 300, i64 64, i64 64, i64 960, i32 0, metadata !56} ; [ DW_TAG_member ] [_old_offset] [line 300, size 64, align 64, offset 960] [from __off_t]
-!56 = metadata !{i32 786454, metadata !30, null, metadata !"__off_t", i32 141, i64 0, i64 0, i64 0, i32 0, metadata !57} ; [ DW_TAG_typedef ] [__off_t] [line 141, size 0, align 0, offset 0] [from long int]
-!57 = metadata !{i32 786468, null, null, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
-!58 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_cur_column", i32 304, i64 16, i64 16, i64 1024, i32 0, metadata !59} ; [ DW_TAG_member ] [_cur_column] [line 304, size 16, align 16, offset 1024] [from unsigned short]
-!59 = metadata !{i32 786468, null, null, metadata !"unsigned short", i32 0, i64 16, i64 16, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [unsigned short] [line 0, size 16, align 16, offset 0, enc DW_ATE_unsigned]
-!60 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_vtable_offset", i32 305, i64 8, i64 8, i64 1040, i32 0, metadata !61} ; [ DW_TAG_member ] [_vtable_offset] [line 305, size 8, align 8, offset 1040] [from signed char]
-!61 = metadata !{i32 786468, null, null, metadata !"signed char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [signed char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!62 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_shortbuf", i32 306, i64 8, i64 8, i64 1048, i32 0, metadata !63} ; [ DW_TAG_member ] [_shortbuf] [line 306, size 8, align 8, offset 1048] [from ]
-!63 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 8, i64 8, i32 0, i32 0, metadata !11, metadata !64, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
+!48 = metadata !{metadata !"0xd\00_next\00187\0064\0064\000\000", metadata !30, metadata !46, metadata !45} ; [ DW_TAG_member ] [_next] [line 187, size 64, align 64, offset 0] [from ]
+!49 = metadata !{metadata !"0xd\00_sbuf\00188\0064\0064\0064\000", metadata !30, metadata !46, metadata !50} ; [ DW_TAG_member ] [_sbuf] [line 188, size 64, align 64, offset 64] [from ]
+!50 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !29} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _IO_FILE]
+!51 = metadata !{metadata !"0xd\00_pos\00192\0032\0032\00128\000", metadata !30, metadata !46, metadata !8} ; [ DW_TAG_member ] [_pos] [line 192, size 32, align 32, offset 128] [from int]
+!52 = metadata !{metadata !"0xd\00_chain\00292\0064\0064\00832\000", metadata !30, metadata !29, metadata !50} ; [ DW_TAG_member ] [_chain] [line 292, size 64, align 64, offset 832] [from ]
+!53 = metadata !{metadata !"0xd\00_fileno\00294\0032\0032\00896\000", metadata !30, metadata !29, metadata !8} ; [ DW_TAG_member ] [_fileno] [line 294, size 32, align 32, offset 896] [from int]
+!54 = metadata !{metadata !"0xd\00_flags2\00298\0032\0032\00928\000", metadata !30, metadata !29, metadata !8} ; [ DW_TAG_member ] [_flags2] [line 298, size 32, align 32, offset 928] [from int]
+!55 = metadata !{metadata !"0xd\00_old_offset\00300\0064\0064\00960\000", metadata !30, metadata !29, metadata !56} ; [ DW_TAG_member ] [_old_offset] [line 300, size 64, align 64, offset 960] [from __off_t]
+!56 = metadata !{metadata !"0x16\00__off_t\00141\000\000\000\000", metadata !30, null, metadata !57} ; [ DW_TAG_typedef ] [__off_t] [line 141, size 0, align 0, offset 0] [from long int]
+!57 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!58 = metadata !{metadata !"0xd\00_cur_column\00304\0016\0016\001024\000", metadata !30, metadata !29, metadata !59} ; [ DW_TAG_member ] [_cur_column] [line 304, size 16, align 16, offset 1024] [from unsigned short]
+!59 = metadata !{metadata !"0x24\00unsigned short\000\0016\0016\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned short] [line 0, size 16, align 16, offset 0, enc DW_ATE_unsigned]
+!60 = metadata !{metadata !"0xd\00_vtable_offset\00305\008\008\001040\000", metadata !30, metadata !29, metadata !61} ; [ DW_TAG_member ] [_vtable_offset] [line 305, size 8, align 8, offset 1040] [from signed char]
+!61 = metadata !{metadata !"0x24\00signed char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [signed char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!62 = metadata !{metadata !"0xd\00_shortbuf\00306\008\008\001048\000", metadata !30, metadata !29, metadata !63} ; [ DW_TAG_member ] [_shortbuf] [line 306, size 8, align 8, offset 1048] [from ]
+!63 = metadata !{metadata !"0x1\00\000\008\008\000\000", null, null, metadata !11, metadata !64, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
 !64 = metadata !{metadata !65}
-!65 = metadata !{i32 786465, i64 0, i64 1}        ; [ DW_TAG_subrange_type ] [0, 0]
-!66 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_lock", i32 310, i64 64, i64 64, i64 1088, i32 0, metadata !67} ; [ DW_TAG_member ] [_lock] [line 310, size 64, align 64, offset 1088] [from ]
-!67 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!68 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_offset", i32 319, i64 64, i64 64, i64 1152, i32 0, metadata !69} ; [ DW_TAG_member ] [_offset] [line 319, size 64, align 64, offset 1152] [from __off64_t]
-!69 = metadata !{i32 786454, metadata !30, null, metadata !"__off64_t", i32 142, i64 0, i64 0, i64 0, i32 0, metadata !57} ; [ DW_TAG_typedef ] [__off64_t] [line 142, size 0, align 0, offset 0] [from long int]
-!70 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad1", i32 328, i64 64, i64 64, i64 1216, i32 0, metadata !67} ; [ DW_TAG_member ] [__pad1] [line 328, size 64, align 64, offset 1216] [from ]
-!71 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad2", i32 329, i64 64, i64 64, i64 1280, i32 0, metadata !67} ; [ DW_TAG_member ] [__pad2] [line 329, size 64, align 64, offset 1280] [from ]
-!72 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad3", i32 330, i64 64, i64 64, i64 1344, i32 0, metadata !67} ; [ DW_TAG_member ] [__pad3] [line 330, size 64, align 64, offset 1344] [from ]
-!73 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad4", i32 331, i64 64, i64 64, i64 1408, i32 0, metadata !67} ; [ DW_TAG_member ] [__pad4] [line 331, size 64, align 64, offset 1408] [from ]
-!74 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad5", i32 332, i64 64, i64 64, i64 1472, i32 0, metadata !75} ; [ DW_TAG_member ] [__pad5] [line 332, size 64, align 64, offset 1472] [from size_t]
-!75 = metadata !{i32 786454, metadata !30, null, metadata !"size_t", i32 42, i64 0, i64 0, i64 0, i32 0, metadata !76} ; [ DW_TAG_typedef ] [size_t] [line 42, size 0, align 0, offset 0] [from long unsigned int]
-!76 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!77 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_mode", i32 334, i64 32, i64 32, i64 1536, i32 0, metadata !8} ; [ DW_TAG_member ] [_mode] [line 334, size 32, align 32, offset 1536] [from int]
-!78 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_unused2", i32 336, i64 160, i64 8, i64 1568, i32 0, metadata !79} ; [ DW_TAG_member ] [_unused2] [line 336, size 160, align 8, offset 1568] [from ]
-!79 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !11, metadata !80, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
+!65 = metadata !{metadata !"0x21\000\001"}        ; [ DW_TAG_subrange_type ] [0, 0]
+!66 = metadata !{metadata !"0xd\00_lock\00310\0064\0064\001088\000", metadata !30, metadata !29, metadata !67} ; [ DW_TAG_member ] [_lock] [line 310, size 64, align 64, offset 1088] [from ]
+!67 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!68 = metadata !{metadata !"0xd\00_offset\00319\0064\0064\001152\000", metadata !30, metadata !29, metadata !69} ; [ DW_TAG_member ] [_offset] [line 319, size 64, align 64, offset 1152] [from __off64_t]
+!69 = metadata !{metadata !"0x16\00__off64_t\00142\000\000\000\000", metadata !30, null, metadata !57} ; [ DW_TAG_typedef ] [__off64_t] [line 142, size 0, align 0, offset 0] [from long int]
+!70 = metadata !{metadata !"0xd\00__pad1\00328\0064\0064\001216\000", metadata !30, metadata !29, metadata !67} ; [ DW_TAG_member ] [__pad1] [line 328, size 64, align 64, offset 1216] [from ]
+!71 = metadata !{metadata !"0xd\00__pad2\00329\0064\0064\001280\000", metadata !30, metadata !29, metadata !67} ; [ DW_TAG_member ] [__pad2] [line 329, size 64, align 64, offset 1280] [from ]
+!72 = metadata !{metadata !"0xd\00__pad3\00330\0064\0064\001344\000", metadata !30, metadata !29, metadata !67} ; [ DW_TAG_member ] [__pad3] [line 330, size 64, align 64, offset 1344] [from ]
+!73 = metadata !{metadata !"0xd\00__pad4\00331\0064\0064\001408\000", metadata !30, metadata !29, metadata !67} ; [ DW_TAG_member ] [__pad4] [line 331, size 64, align 64, offset 1408] [from ]
+!74 = metadata !{metadata !"0xd\00__pad5\00332\0064\0064\001472\000", metadata !30, metadata !29, metadata !75} ; [ DW_TAG_member ] [__pad5] [line 332, size 64, align 64, offset 1472] [from size_t]
+!75 = metadata !{metadata !"0x16\00size_t\0042\000\000\000\000", metadata !30, null, metadata !76} ; [ DW_TAG_typedef ] [size_t] [line 42, size 0, align 0, offset 0] [from long unsigned int]
+!76 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!77 = metadata !{metadata !"0xd\00_mode\00334\0032\0032\001536\000", metadata !30, metadata !29, metadata !8} ; [ DW_TAG_member ] [_mode] [line 334, size 32, align 32, offset 1536] [from int]
+!78 = metadata !{metadata !"0xd\00_unused2\00336\00160\008\001568\000", metadata !30, metadata !29, metadata !79} ; [ DW_TAG_member ] [_unused2] [line 336, size 160, align 8, offset 1568] [from ]
+!79 = metadata !{metadata !"0x1\00\000\00160\008\000\000", null, null, metadata !11, metadata !80, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
 !80 = metadata !{metadata !81}
-!81 = metadata !{i32 786465, i64 0, i64 20}       ; [ DW_TAG_subrange_type ] [0, 19]
-!82 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"verify", metadata !"verify", metadata !"", i32 2388, metadata !83, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !86, i32 2388} ; [ DW_TAG_subprogram ] [line 2388] [local] [def] [verify]
-!83 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !84, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!81 = metadata !{metadata !"0x21\000\0020"}       ; [ DW_TAG_subrange_type ] [0, 19]
+!82 = metadata !{metadata !"0x2e\00verify\00verify\00\002388\001\001\000\006\00256\001\002388", metadata !1, metadata !5, metadata !83, null, null, null, null, metadata !86} ; [ DW_TAG_subprogram ] [line 2388] [local] [def] [verify]
+!83 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !84, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !84 = metadata !{null, metadata !8, metadata !10, metadata !85}
-!85 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !24} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from boolean]
+!85 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !24} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from boolean]
 !86 = metadata !{metadata !87, metadata !88, metadata !89, metadata !90, metadata !94, metadata !95, metadata !96, metadata !97, metadata !98, metadata !99, metadata !100, metadata !101}
-!87 = metadata !{i32 786689, metadata !82, metadata !"no_time_steps", metadata !5, i32 16779604, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [no_time_steps] [line 2388]
-!88 = metadata !{i32 786689, metadata !82, metadata !"class", metadata !5, i32 33556820, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [class] [line 2388]
-!89 = metadata !{i32 786689, metadata !82, metadata !"verified", metadata !5, i32 50334036, metadata !85, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [verified] [line 2388]
-!90 = metadata !{i32 786688, metadata !82, metadata !"xcrref", metadata !5, i32 2397, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xcrref] [line 2397]
-!91 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 320, i64 64, i32 0, i32 0, metadata !20, metadata !92, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 64, offset 0] [from double]
+!87 = metadata !{metadata !"0x101\00no_time_steps\0016779604\000", metadata !82, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [no_time_steps] [line 2388]
+!88 = metadata !{metadata !"0x101\00class\0033556820\000", metadata !82, metadata !5, metadata !10} ; [ DW_TAG_arg_variable ] [class] [line 2388]
+!89 = metadata !{metadata !"0x101\00verified\0050334036\000", metadata !82, metadata !5, metadata !85} ; [ DW_TAG_arg_variable ] [verified] [line 2388]
+!90 = metadata !{metadata !"0x100\00xcrref\002397\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xcrref] [line 2397]
+!91 = metadata !{metadata !"0x1\00\000\00320\0064\000\000", null, null, metadata !20, metadata !92, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 64, offset 0] [from double]
 !92 = metadata !{metadata !93}
-!93 = metadata !{i32 786465, i64 0, i64 5}        ; [ DW_TAG_subrange_type ] [0, 4]
-!94 = metadata !{i32 786688, metadata !82, metadata !"xceref", metadata !5, i32 2397, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xceref] [line 2397]
-!95 = metadata !{i32 786688, metadata !82, metadata !"xcrdif", metadata !5, i32 2397, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xcrdif] [line 2397]
-!96 = metadata !{i32 786688, metadata !82, metadata !"xcedif", metadata !5, i32 2397, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xcedif] [line 2397]
-!97 = metadata !{i32 786688, metadata !82, metadata !"epsilon", metadata !5, i32 2398, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [epsilon] [line 2398]
-!98 = metadata !{i32 786688, metadata !82, metadata !"xce", metadata !5, i32 2398, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xce] [line 2398]
-!99 = metadata !{i32 786688, metadata !82, metadata !"xcr", metadata !5, i32 2398, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xcr] [line 2398]
-!100 = metadata !{i32 786688, metadata !82, metadata !"dtref", metadata !5, i32 2398, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [dtref] [line 2398]
-!101 = metadata !{i32 786688, metadata !82, metadata !"m", metadata !5, i32 2399, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 2399]
-!102 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"rhs_norm", metadata !"rhs_norm", metadata !"", i32 266, metadata !103, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !106, i32 266} ; [ DW_TAG_subprogram ] [line 266] [local] [def] [rhs_norm]
-!103 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !104, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!93 = metadata !{metadata !"0x21\000\005"}        ; [ DW_TAG_subrange_type ] [0, 4]
+!94 = metadata !{metadata !"0x100\00xceref\002397\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xceref] [line 2397]
+!95 = metadata !{metadata !"0x100\00xcrdif\002397\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xcrdif] [line 2397]
+!96 = metadata !{metadata !"0x100\00xcedif\002397\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xcedif] [line 2397]
+!97 = metadata !{metadata !"0x100\00epsilon\002398\000", metadata !82, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [epsilon] [line 2398]
+!98 = metadata !{metadata !"0x100\00xce\002398\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xce] [line 2398]
+!99 = metadata !{metadata !"0x100\00xcr\002398\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xcr] [line 2398]
+!100 = metadata !{metadata !"0x100\00dtref\002398\000", metadata !82, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [dtref] [line 2398]
+!101 = metadata !{metadata !"0x100\00m\002399\000", metadata !82, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 2399]
+!102 = metadata !{metadata !"0x2e\00rhs_norm\00rhs_norm\00\00266\001\001\000\006\00256\001\00266", metadata !1, metadata !5, metadata !103, null, null, null, null, metadata !106} ; [ DW_TAG_subprogram ] [line 266] [local] [def] [rhs_norm]
+!103 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !104, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !104 = metadata !{null, metadata !105}
-!105 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from double]
+!105 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from double]
 !106 = metadata !{metadata !107, metadata !108, metadata !109, metadata !110, metadata !111, metadata !112, metadata !113}
-!107 = metadata !{i32 786689, metadata !102, metadata !"rms", metadata !5, i32 16777482, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [rms] [line 266]
-!108 = metadata !{i32 786688, metadata !102, metadata !"i", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 271]
-!109 = metadata !{i32 786688, metadata !102, metadata !"j", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 271]
-!110 = metadata !{i32 786688, metadata !102, metadata !"k", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 271]
-!111 = metadata !{i32 786688, metadata !102, metadata !"d", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 271]
-!112 = metadata !{i32 786688, metadata !102, metadata !"m", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 271]
-!113 = metadata !{i32 786688, metadata !102, metadata !"add", metadata !5, i32 272, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [add] [line 272]
-!114 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"compute_rhs", metadata !"compute_rhs", metadata !"", i32 1767, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @compute_rhs, null, null, metadata !117, i32 1767} ; [ DW_TAG_subprogram ] [line 1767] [local] [def] [compute_rhs]
-!115 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !116, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!107 = metadata !{metadata !"0x101\00rms\0016777482\000", metadata !102, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [rms] [line 266]
+!108 = metadata !{metadata !"0x100\00i\00271\000", metadata !102, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 271]
+!109 = metadata !{metadata !"0x100\00j\00271\000", metadata !102, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 271]
+!110 = metadata !{metadata !"0x100\00k\00271\000", metadata !102, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 271]
+!111 = metadata !{metadata !"0x100\00d\00271\000", metadata !102, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [d] [line 271]
+!112 = metadata !{metadata !"0x100\00m\00271\000", metadata !102, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 271]
+!113 = metadata !{metadata !"0x100\00add\00272\000", metadata !102, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [add] [line 272]
+!114 = metadata !{metadata !"0x2e\00compute_rhs\00compute_rhs\00\001767\001\001\000\006\00256\001\001767", metadata !1, metadata !5, metadata !115, null, void ()* @compute_rhs, null, null, metadata !117} ; [ DW_TAG_subprogram ] [line 1767] [local] [def] [compute_rhs]
+!115 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !116, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !116 = metadata !{null}
 !117 = metadata !{metadata !118, metadata !119, metadata !120, metadata !121, metadata !122, metadata !123, metadata !124, metadata !125, metadata !126, metadata !127, metadata !128, metadata !129, metadata !130, metadata !131}
-!118 = metadata !{i32 786688, metadata !114, metadata !"i", metadata !5, i32 1769, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 1769]
-!119 = metadata !{i32 786688, metadata !114, metadata !"j", metadata !5, i32 1769, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 1769]
-!120 = metadata !{i32 786688, metadata !114, metadata !"k", metadata !5, i32 1769, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 1769]
-!121 = metadata !{i32 786688, metadata !114, metadata !"m", metadata !5, i32 1769, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 1769]
-!122 = metadata !{i32 786688, metadata !114, metadata !"rho_inv", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [rho_inv] [line 1770]
-!123 = metadata !{i32 786688, metadata !114, metadata !"uijk", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [uijk] [line 1770]
-!124 = metadata !{i32 786688, metadata !114, metadata !"up1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [up1] [line 1770]
-!125 = metadata !{i32 786688, metadata !114, metadata !"um1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [um1] [line 1770]
-!126 = metadata !{i32 786688, metadata !114, metadata !"vijk", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vijk] [line 1770]
-!127 = metadata !{i32 786688, metadata !114, metadata !"vp1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vp1] [line 1770]
-!128 = metadata !{i32 786688, metadata !114, metadata !"vm1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vm1] [line 1770]
-!129 = metadata !{i32 786688, metadata !114, metadata !"wijk", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [wijk] [line 1770]
-!130 = metadata !{i32 786688, metadata !114, metadata !"wp1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [wp1] [line 1770]
-!131 = metadata !{i32 786688, metadata !114, metadata !"wm1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [wm1] [line 1770]
-!132 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"error_norm", metadata !"error_norm", metadata !"", i32 225, metadata !103, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !133, i32 225} ; [ DW_TAG_subprogram ] [line 225] [local] [def] [error_norm]
+!118 = metadata !{metadata !"0x100\00i\001769\000", metadata !114, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 1769]
+!119 = metadata !{metadata !"0x100\00j\001769\000", metadata !114, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 1769]
+!120 = metadata !{metadata !"0x100\00k\001769\000", metadata !114, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 1769]
+!121 = metadata !{metadata !"0x100\00m\001769\000", metadata !114, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 1769]
+!122 = metadata !{metadata !"0x100\00rho_inv\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [rho_inv] [line 1770]
+!123 = metadata !{metadata !"0x100\00uijk\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [uijk] [line 1770]
+!124 = metadata !{metadata !"0x100\00up1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [up1] [line 1770]
+!125 = metadata !{metadata !"0x100\00um1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [um1] [line 1770]
+!126 = metadata !{metadata !"0x100\00vijk\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [vijk] [line 1770]
+!127 = metadata !{metadata !"0x100\00vp1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [vp1] [line 1770]
+!128 = metadata !{metadata !"0x100\00vm1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [vm1] [line 1770]
+!129 = metadata !{metadata !"0x100\00wijk\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [wijk] [line 1770]
+!130 = metadata !{metadata !"0x100\00wp1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [wp1] [line 1770]
+!131 = metadata !{metadata !"0x100\00wm1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [wm1] [line 1770]
+!132 = metadata !{metadata !"0x2e\00error_norm\00error_norm\00\00225\001\001\000\006\00256\001\00225", metadata !1, metadata !5, metadata !103, null, null, null, null, metadata !133} ; [ DW_TAG_subprogram ] [line 225] [local] [def] [error_norm]
 !133 = metadata !{metadata !134, metadata !135, metadata !136, metadata !137, metadata !138, metadata !139, metadata !140, metadata !141, metadata !142, metadata !143, metadata !144}
-!134 = metadata !{i32 786689, metadata !132, metadata !"rms", metadata !5, i32 16777441, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [rms] [line 225]
-!135 = metadata !{i32 786688, metadata !132, metadata !"i", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 232]
-!136 = metadata !{i32 786688, metadata !132, metadata !"j", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 232]
-!137 = metadata !{i32 786688, metadata !132, metadata !"k", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 232]
-!138 = metadata !{i32 786688, metadata !132, metadata !"m", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 232]
-!139 = metadata !{i32 786688, metadata !132, metadata !"d", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 232]
-!140 = metadata !{i32 786688, metadata !132, metadata !"xi", metadata !5, i32 233, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xi] [line 233]
-!141 = metadata !{i32 786688, metadata !132, metadata !"eta", metadata !5, i32 233, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [eta] [line 233]
-!142 = metadata !{i32 786688, metadata !132, metadata !"zeta", metadata !5, i32 233, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [zeta] [line 233]
-!143 = metadata !{i32 786688, metadata !132, metadata !"u_exact", metadata !5, i32 233, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [u_exact] [line 233]
-!144 = metadata !{i32 786688, metadata !132, metadata !"add", metadata !5, i32 233, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [add] [line 233]
-!145 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"exact_solution", metadata !"exact_solution", metadata !"", i32 643, metadata !146, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !148, i32 644} ; [ DW_TAG_subprogram ] [line 643] [local] [def] [scope 644] [exact_solution]
-!146 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !147, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!134 = metadata !{metadata !"0x101\00rms\0016777441\000", metadata !132, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [rms] [line 225]
+!135 = metadata !{metadata !"0x100\00i\00232\000", metadata !132, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 232]
+!136 = metadata !{metadata !"0x100\00j\00232\000", metadata !132, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 232]
+!137 = metadata !{metadata !"0x100\00k\00232\000", metadata !132, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 232]
+!138 = metadata !{metadata !"0x100\00m\00232\000", metadata !132, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 232]
+!139 = metadata !{metadata !"0x100\00d\00232\000", metadata !132, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [d] [line 232]
+!140 = metadata !{metadata !"0x100\00xi\00233\000", metadata !132, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [xi] [line 233]
+!141 = metadata !{metadata !"0x100\00eta\00233\000", metadata !132, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [eta] [line 233]
+!142 = metadata !{metadata !"0x100\00zeta\00233\000", metadata !132, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [zeta] [line 233]
+!143 = metadata !{metadata !"0x100\00u_exact\00233\000", metadata !132, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [u_exact] [line 233]
+!144 = metadata !{metadata !"0x100\00add\00233\000", metadata !132, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [add] [line 233]
+!145 = metadata !{metadata !"0x2e\00exact_solution\00exact_solution\00\00643\001\001\000\006\00256\001\00644", metadata !1, metadata !5, metadata !146, null, null, null, null, metadata !148} ; [ DW_TAG_subprogram ] [line 643] [local] [def] [scope 644] [exact_solution]
+!146 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !147, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !147 = metadata !{null, metadata !20, metadata !20, metadata !20, metadata !105}
 !148 = metadata !{metadata !149, metadata !150, metadata !151, metadata !152, metadata !153}
-!149 = metadata !{i32 786689, metadata !145, metadata !"xi", metadata !5, i32 16777859, metadata !20, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [xi] [line 643]
-!150 = metadata !{i32 786689, metadata !145, metadata !"eta", metadata !5, i32 33555075, metadata !20, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [eta] [line 643]
-!151 = metadata !{i32 786689, metadata !145, metadata !"zeta", metadata !5, i32 50332291, metadata !20, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [zeta] [line 643]
-!152 = metadata !{i32 786689, metadata !145, metadata !"dtemp", metadata !5, i32 67109508, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [dtemp] [line 644]
-!153 = metadata !{i32 786688, metadata !145, metadata !"m", metadata !5, i32 653, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 653]
-!154 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"set_constants", metadata !"set_constants", metadata !"", i32 2191, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 2191} ; [ DW_TAG_subprogram ] [line 2191] [local] [def] [set_constants]
-!155 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"lhsinit", metadata !"lhsinit", metadata !"", i32 855, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !156, i32 855} ; [ DW_TAG_subprogram ] [line 855] [local] [def] [lhsinit]
+!149 = metadata !{metadata !"0x101\00xi\0016777859\000", metadata !145, metadata !5, metadata !20} ; [ DW_TAG_arg_variable ] [xi] [line 643]
+!150 = metadata !{metadata !"0x101\00eta\0033555075\000", metadata !145, metadata !5, metadata !20} ; [ DW_TAG_arg_variable ] [eta] [line 643]
+!151 = metadata !{metadata !"0x101\00zeta\0050332291\000", metadata !145, metadata !5, metadata !20} ; [ DW_TAG_arg_variable ] [zeta] [line 643]
+!152 = metadata !{metadata !"0x101\00dtemp\0067109508\000", metadata !145, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [dtemp] [line 644]
+!153 = metadata !{metadata !"0x100\00m\00653\000", metadata !145, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 653]
+!154 = metadata !{metadata !"0x2e\00set_constants\00set_constants\00\002191\001\001\000\006\00256\001\002191", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2191] [local] [def] [set_constants]
+!155 = metadata !{metadata !"0x2e\00lhsinit\00lhsinit\00\00855\001\001\000\006\00256\001\00855", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !156} ; [ DW_TAG_subprogram ] [line 855] [local] [def] [lhsinit]
 !156 = metadata !{metadata !157, metadata !158, metadata !159, metadata !160, metadata !161}
-!157 = metadata !{i32 786688, metadata !155, metadata !"i", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 857]
-!158 = metadata !{i32 786688, metadata !155, metadata !"j", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 857]
-!159 = metadata !{i32 786688, metadata !155, metadata !"k", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 857]
-!160 = metadata !{i32 786688, metadata !155, metadata !"m", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 857]
-!161 = metadata !{i32 786688, metadata !155, metadata !"n", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n] [line 857]
-!162 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"initialize", metadata !"initialize", metadata !"", i32 669, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !163, i32 669} ; [ DW_TAG_subprogram ] [line 669] [local] [def] [initialize]
+!157 = metadata !{metadata !"0x100\00i\00857\000", metadata !155, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 857]
+!158 = metadata !{metadata !"0x100\00j\00857\000", metadata !155, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 857]
+!159 = metadata !{metadata !"0x100\00k\00857\000", metadata !155, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 857]
+!160 = metadata !{metadata !"0x100\00m\00857\000", metadata !155, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 857]
+!161 = metadata !{metadata !"0x100\00n\00857\000", metadata !155, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [n] [line 857]
+!162 = metadata !{metadata !"0x2e\00initialize\00initialize\00\00669\001\001\000\006\00256\001\00669", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !163} ; [ DW_TAG_subprogram ] [line 669] [local] [def] [initialize]
 !163 = metadata !{metadata !164, metadata !165, metadata !166, metadata !167, metadata !168, metadata !169, metadata !170, metadata !171, metadata !172, metadata !173, metadata !174, metadata !179, metadata !180, metadata !181, metadata !182}
-!164 = metadata !{i32 786688, metadata !162, metadata !"i", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 679]
-!165 = metadata !{i32 786688, metadata !162, metadata !"j", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 679]
-!166 = metadata !{i32 786688, metadata !162, metadata !"k", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 679]
-!167 = metadata !{i32 786688, metadata !162, metadata !"m", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 679]
-!168 = metadata !{i32 786688, metadata !162, metadata !"ix", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [ix] [line 679]
-!169 = metadata !{i32 786688, metadata !162, metadata !"iy", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [iy] [line 679]
-!170 = metadata !{i32 786688, metadata !162, metadata !"iz", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [iz] [line 679]
-!171 = metadata !{i32 786688, metadata !162, metadata !"xi", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xi] [line 680]
-!172 = metadata !{i32 786688, metadata !162, metadata !"eta", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [eta] [line 680]
-!173 = metadata !{i32 786688, metadata !162, metadata !"zeta", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [zeta] [line 680]
-!174 = metadata !{i32 786688, metadata !162, metadata !"Pface", metadata !5, i32 680, metadata !175, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Pface] [line 680]
-!175 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 1920, i64 64, i32 0, i32 0, metadata !20, metadata !176, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1920, align 64, offset 0] [from double]
+!164 = metadata !{metadata !"0x100\00i\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 679]
+!165 = metadata !{metadata !"0x100\00j\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 679]
+!166 = metadata !{metadata !"0x100\00k\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 679]
+!167 = metadata !{metadata !"0x100\00m\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 679]
+!168 = metadata !{metadata !"0x100\00ix\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [ix] [line 679]
+!169 = metadata !{metadata !"0x100\00iy\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [iy] [line 679]
+!170 = metadata !{metadata !"0x100\00iz\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [iz] [line 679]
+!171 = metadata !{metadata !"0x100\00xi\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [xi] [line 680]
+!172 = metadata !{metadata !"0x100\00eta\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [eta] [line 680]
+!173 = metadata !{metadata !"0x100\00zeta\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [zeta] [line 680]
+!174 = metadata !{metadata !"0x100\00Pface\00680\000", metadata !162, metadata !5, metadata !175} ; [ DW_TAG_auto_variable ] [Pface] [line 680]
+!175 = metadata !{metadata !"0x1\00\000\001920\0064\000\000", null, null, metadata !20, metadata !176, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1920, align 64, offset 0] [from double]
 !176 = metadata !{metadata !177, metadata !178, metadata !93}
-!177 = metadata !{i32 786465, i64 0, i64 2}       ; [ DW_TAG_subrange_type ] [0, 1]
-!178 = metadata !{i32 786465, i64 0, i64 3}       ; [ DW_TAG_subrange_type ] [0, 2]
-!179 = metadata !{i32 786688, metadata !162, metadata !"Pxi", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Pxi] [line 680]
-!180 = metadata !{i32 786688, metadata !162, metadata !"Peta", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Peta] [line 680]
-!181 = metadata !{i32 786688, metadata !162, metadata !"Pzeta", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Pzeta] [line 680]
-!182 = metadata !{i32 786688, metadata !162, metadata !"temp", metadata !5, i32 680, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [temp] [line 680]
-!183 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"exact_rhs", metadata !"exact_rhs", metadata !"", i32 301, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !184, i32 301} ; [ DW_TAG_subprogram ] [line 301] [local] [def] [exact_rhs]
+!177 = metadata !{metadata !"0x21\000\002"}       ; [ DW_TAG_subrange_type ] [0, 1]
+!178 = metadata !{metadata !"0x21\000\003"}       ; [ DW_TAG_subrange_type ] [0, 2]
+!179 = metadata !{metadata !"0x100\00Pxi\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [Pxi] [line 680]
+!180 = metadata !{metadata !"0x100\00Peta\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [Peta] [line 680]
+!181 = metadata !{metadata !"0x100\00Pzeta\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [Pzeta] [line 680]
+!182 = metadata !{metadata !"0x100\00temp\00680\000", metadata !162, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [temp] [line 680]
+!183 = metadata !{metadata !"0x2e\00exact_rhs\00exact_rhs\00\00301\001\001\000\006\00256\001\00301", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !184} ; [ DW_TAG_subprogram ] [line 301] [local] [def] [exact_rhs]
 !184 = metadata !{metadata !185, metadata !186, metadata !187, metadata !188, metadata !189, metadata !190, metadata !191, metadata !192, metadata !193, metadata !194, metadata !195, metadata !196, metadata !197, metadata !198, metadata !199}
-!185 = metadata !{i32 786688, metadata !183, metadata !"dtemp", metadata !5, i32 310, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [dtemp] [line 310]
-!186 = metadata !{i32 786688, metadata !183, metadata !"xi", metadata !5, i32 310, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xi] [line 310]
-!187 = metadata !{i32 786688, metadata !183, metadata !"eta", metadata !5, i32 310, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [eta] [line 310]
-!188 = metadata !{i32 786688, metadata !183, metadata !"zeta", metadata !5, i32 310, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [zeta] [line 310]
-!189 = metadata !{i32 786688, metadata !183, metadata !"dtpp", metadata !5, i32 310, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [dtpp] [line 310]
-!190 = metadata !{i32 786688, metadata !183, metadata !"m", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 311]
-!191 = metadata !{i32 786688, metadata !183, metadata !"i", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 311]
-!192 = metadata !{i32 786688, metadata !183, metadata !"j", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 311]
-!193 = metadata !{i32 786688, metadata !183, metadata !"k", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 311]
-!194 = metadata !{i32 786688, metadata !183, metadata !"ip1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [ip1] [line 311]
-!195 = metadata !{i32 786688, metadata !183, metadata !"im1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [im1] [line 311]
-!196 = metadata !{i32 786688, metadata !183, metadata !"jp1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [jp1] [line 311]
-!197 = metadata !{i32 786688, metadata !183, metadata !"jm1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [jm1] [line 311]
-!198 = metadata !{i32 786688, metadata !183, metadata !"km1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [km1] [line 311]
-!199 = metadata !{i32 786688, metadata !183, metadata !"kp1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [kp1] [line 311]
-!200 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"adi", metadata !"adi", metadata !"", i32 210, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 210} ; [ DW_TAG_subprogram ] [line 210] [local] [def] [adi]
-!201 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"add", metadata !"add", metadata !"", i32 187, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !202, i32 187} ; [ DW_TAG_subprogram ] [line 187] [local] [def] [add]
+!185 = metadata !{metadata !"0x100\00dtemp\00310\000", metadata !183, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [dtemp] [line 310]
+!186 = metadata !{metadata !"0x100\00xi\00310\000", metadata !183, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [xi] [line 310]
+!187 = metadata !{metadata !"0x100\00eta\00310\000", metadata !183, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [eta] [line 310]
+!188 = metadata !{metadata !"0x100\00zeta\00310\000", metadata !183, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [zeta] [line 310]
+!189 = metadata !{metadata !"0x100\00dtpp\00310\000", metadata !183, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [dtpp] [line 310]
+!190 = metadata !{metadata !"0x100\00m\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 311]
+!191 = metadata !{metadata !"0x100\00i\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 311]
+!192 = metadata !{metadata !"0x100\00j\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 311]
+!193 = metadata !{metadata !"0x100\00k\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 311]
+!194 = metadata !{metadata !"0x100\00ip1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [ip1] [line 311]
+!195 = metadata !{metadata !"0x100\00im1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [im1] [line 311]
+!196 = metadata !{metadata !"0x100\00jp1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [jp1] [line 311]
+!197 = metadata !{metadata !"0x100\00jm1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [jm1] [line 311]
+!198 = metadata !{metadata !"0x100\00km1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [km1] [line 311]
+!199 = metadata !{metadata !"0x100\00kp1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [kp1] [line 311]
+!200 = metadata !{metadata !"0x2e\00adi\00adi\00\00210\001\001\000\006\00256\001\00210", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 210] [local] [def] [adi]
+!201 = metadata !{metadata !"0x2e\00add\00add\00\00187\001\001\000\006\00256\001\00187", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !202} ; [ DW_TAG_subprogram ] [line 187] [local] [def] [add]
 !202 = metadata !{metadata !203, metadata !204, metadata !205, metadata !206}
-!203 = metadata !{i32 786688, metadata !201, metadata !"i", metadata !5, i32 193, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 193]
-!204 = metadata !{i32 786688, metadata !201, metadata !"j", metadata !5, i32 193, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 193]
-!205 = metadata !{i32 786688, metadata !201, metadata !"k", metadata !5, i32 193, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 193]
-!206 = metadata !{i32 786688, metadata !201, metadata !"m", metadata !5, i32 193, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 193]
-!207 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"z_solve", metadata !"z_solve", metadata !"", i32 3457, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 3457} ; [ DW_TAG_subprogram ] [line 3457] [local] [def] [z_solve]
-!208 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"z_backsubstitute", metadata !"z_backsubstitute", metadata !"", i32 3480, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !209, i32 3480} ; [ DW_TAG_subprogram ] [line 3480] [local] [def] [z_backsubstitute]
+!203 = metadata !{metadata !"0x100\00i\00193\000", metadata !201, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 193]
+!204 = metadata !{metadata !"0x100\00j\00193\000", metadata !201, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 193]
+!205 = metadata !{metadata !"0x100\00k\00193\000", metadata !201, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 193]
+!206 = metadata !{metadata !"0x100\00m\00193\000", metadata !201, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 193]
+!207 = metadata !{metadata !"0x2e\00z_solve\00z_solve\00\003457\001\001\000\006\00256\001\003457", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3457] [local] [def] [z_solve]
+!208 = metadata !{metadata !"0x2e\00z_backsubstitute\00z_backsubstitute\00\003480\001\001\000\006\00256\001\003480", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !209} ; [ DW_TAG_subprogram ] [line 3480] [local] [def] [z_backsubstitute]
 !209 = metadata !{metadata !210, metadata !211, metadata !212, metadata !213, metadata !214}
-!210 = metadata !{i32 786688, metadata !208, metadata !"i", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3492]
-!211 = metadata !{i32 786688, metadata !208, metadata !"j", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 3492]
-!212 = metadata !{i32 786688, metadata !208, metadata !"k", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 3492]
-!213 = metadata !{i32 786688, metadata !208, metadata !"m", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 3492]
-!214 = metadata !{i32 786688, metadata !208, metadata !"n", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n] [line 3492]
-!215 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"z_solve_cell", metadata !"z_solve_cell", metadata !"", i32 3512, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !216, i32 3512} ; [ DW_TAG_subprogram ] [line 3512] [local] [def] [z_solve_cell]
+!210 = metadata !{metadata !"0x100\00i\003492\000", metadata !208, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 3492]
+!211 = metadata !{metadata !"0x100\00j\003492\000", metadata !208, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 3492]
+!212 = metadata !{metadata !"0x100\00k\003492\000", metadata !208, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 3492]
+!213 = metadata !{metadata !"0x100\00m\003492\000", metadata !208, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 3492]
+!214 = metadata !{metadata !"0x100\00n\003492\000", metadata !208, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [n] [line 3492]
+!215 = metadata !{metadata !"0x2e\00z_solve_cell\00z_solve_cell\00\003512\001\001\000\006\00256\001\003512", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !216} ; [ DW_TAG_subprogram ] [line 3512] [local] [def] [z_solve_cell]
 !216 = metadata !{metadata !217, metadata !218, metadata !219, metadata !220}
-!217 = metadata !{i32 786688, metadata !215, metadata !"i", metadata !5, i32 3527, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3527]
-!218 = metadata !{i32 786688, metadata !215, metadata !"j", metadata !5, i32 3527, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 3527]
-!219 = metadata !{i32 786688, metadata !215, metadata !"k", metadata !5, i32 3527, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 3527]
-!220 = metadata !{i32 786688, metadata !215, metadata !"ksize", metadata !5, i32 3527, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [ksize] [line 3527]
-!221 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"binvrhs", metadata !"binvrhs", metadata !"", i32 3154, metadata !222, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !225, i32 3154} ; [ DW_TAG_subprogram ] [line 3154] [local] [def] [binvrhs]
-!222 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !223, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!217 = metadata !{metadata !"0x100\00i\003527\000", metadata !215, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 3527]
+!218 = metadata !{metadata !"0x100\00j\003527\000", metadata !215, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 3527]
+!219 = metadata !{metadata !"0x100\00k\003527\000", metadata !215, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 3527]
+!220 = metadata !{metadata !"0x100\00ksize\003527\000", metadata !215, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [ksize] [line 3527]
+!221 = metadata !{metadata !"0x2e\00binvrhs\00binvrhs\00\003154\001\001\000\006\00256\001\003154", metadata !1, metadata !5, metadata !222, null, null, null, null, metadata !225} ; [ DW_TAG_subprogram ] [line 3154] [local] [def] [binvrhs]
+!222 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !223, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !223 = metadata !{null, metadata !224, metadata !105}
-!224 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !91} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!224 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !91} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
 !225 = metadata !{metadata !226, metadata !227, metadata !228, metadata !229}
-!226 = metadata !{i32 786689, metadata !221, metadata !"lhs", metadata !5, i32 16780370, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [lhs] [line 3154]
-!227 = metadata !{i32 786689, metadata !221, metadata !"r", metadata !5, i32 33557586, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [r] [line 3154]
-!228 = metadata !{i32 786688, metadata !221, metadata !"pivot", metadata !5, i32 3159, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [pivot] [line 3159]
-!229 = metadata !{i32 786688, metadata !221, metadata !"coeff", metadata !5, i32 3159, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [coeff] [line 3159]
-!230 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"matmul_sub", metadata !"matmul_sub", metadata !"", i32 2841, metadata !231, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !233, i32 2842} ; [ DW_TAG_subprogram ] [line 2841] [local] [def] [scope 2842] [matmul_sub]
-!231 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !232, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!226 = metadata !{metadata !"0x101\00lhs\0016780370\000", metadata !221, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [lhs] [line 3154]
+!227 = metadata !{metadata !"0x101\00r\0033557586\000", metadata !221, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [r] [line 3154]
+!228 = metadata !{metadata !"0x100\00pivot\003159\000", metadata !221, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [pivot] [line 3159]
+!229 = metadata !{metadata !"0x100\00coeff\003159\000", metadata !221, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [coeff] [line 3159]
+!230 = metadata !{metadata !"0x2e\00matmul_sub\00matmul_sub\00\002841\001\001\000\006\00256\001\002842", metadata !1, metadata !5, metadata !231, null, null, null, null, metadata !233} ; [ DW_TAG_subprogram ] [line 2841] [local] [def] [scope 2842] [matmul_sub]
+!231 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !232, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !232 = metadata !{null, metadata !224, metadata !224, metadata !224}
 !233 = metadata !{metadata !234, metadata !235, metadata !236, metadata !237}
-!234 = metadata !{i32 786689, metadata !230, metadata !"ablock", metadata !5, i32 16780057, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [ablock] [line 2841]
-!235 = metadata !{i32 786689, metadata !230, metadata !"bblock", metadata !5, i32 33557273, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [bblock] [line 2841]
-!236 = metadata !{i32 786689, metadata !230, metadata !"cblock", metadata !5, i32 50334490, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [cblock] [line 2842]
-!237 = metadata !{i32 786688, metadata !230, metadata !"j", metadata !5, i32 2851, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 2851]
-!238 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"matvec_sub", metadata !"matvec_sub", metadata !"", i32 2814, metadata !239, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !241, i32 2814} ; [ DW_TAG_subprogram ] [line 2814] [local] [def] [matvec_sub]
-!239 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !240, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!234 = metadata !{metadata !"0x101\00ablock\0016780057\000", metadata !230, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [ablock] [line 2841]
+!235 = metadata !{metadata !"0x101\00bblock\0033557273\000", metadata !230, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [bblock] [line 2841]
+!236 = metadata !{metadata !"0x101\00cblock\0050334490\000", metadata !230, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [cblock] [line 2842]
+!237 = metadata !{metadata !"0x100\00j\002851\000", metadata !230, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 2851]
+!238 = metadata !{metadata !"0x2e\00matvec_sub\00matvec_sub\00\002814\001\001\000\006\00256\001\002814", metadata !1, metadata !5, metadata !239, null, null, null, null, metadata !241} ; [ DW_TAG_subprogram ] [line 2814] [local] [def] [matvec_sub]
+!239 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !240, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !240 = metadata !{null, metadata !224, metadata !105, metadata !105}
 !241 = metadata !{metadata !242, metadata !243, metadata !244, metadata !245}
-!242 = metadata !{i32 786689, metadata !238, metadata !"ablock", metadata !5, i32 16780030, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [ablock] [line 2814]
-!243 = metadata !{i32 786689, metadata !238, metadata !"avec", metadata !5, i32 33557246, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [avec] [line 2814]
-!244 = metadata !{i32 786689, metadata !238, metadata !"bvec", metadata !5, i32 50334462, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [bvec] [line 2814]
-!245 = metadata !{i32 786688, metadata !238, metadata !"i", metadata !5, i32 2823, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2823]
-!246 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"binvcrhs", metadata !"binvcrhs", metadata !"", i32 2885, metadata !247, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !249, i32 2885} ; [ DW_TAG_subprogram ] [line 2885] [local] [def] [binvcrhs]
-!247 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !248, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!242 = metadata !{metadata !"0x101\00ablock\0016780030\000", metadata !238, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [ablock] [line 2814]
+!243 = metadata !{metadata !"0x101\00avec\0033557246\000", metadata !238, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [avec] [line 2814]
+!244 = metadata !{metadata !"0x101\00bvec\0050334462\000", metadata !238, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [bvec] [line 2814]
+!245 = metadata !{metadata !"0x100\00i\002823\000", metadata !238, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 2823]
+!246 = metadata !{metadata !"0x2e\00binvcrhs\00binvcrhs\00\002885\001\001\000\006\00256\001\002885", metadata !1, metadata !5, metadata !247, null, null, null, null, metadata !249} ; [ DW_TAG_subprogram ] [line 2885] [local] [def] [binvcrhs]
+!247 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !248, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !248 = metadata !{null, metadata !224, metadata !224, metadata !105}
 !249 = metadata !{metadata !250, metadata !251, metadata !252, metadata !253, metadata !254}
-!250 = metadata !{i32 786689, metadata !246, metadata !"lhs", metadata !5, i32 16780101, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [lhs] [line 2885]
-!251 = metadata !{i32 786689, metadata !246, metadata !"c", metadata !5, i32 33557317, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [c] [line 2885]
-!252 = metadata !{i32 786689, metadata !246, metadata !"r", metadata !5, i32 50334533, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [r] [line 2885]
-!253 = metadata !{i32 786688, metadata !246, metadata !"pivot", metadata !5, i32 2890, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [pivot] [line 2890]
-!254 = metadata !{i32 786688, metadata !246, metadata !"coeff", metadata !5, i32 2890, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [coeff] [line 2890]
-!255 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"lhsz", metadata !"lhsz", metadata !"", i32 1475, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !256, i32 1475} ; [ DW_TAG_subprogram ] [line 1475] [local] [def] [lhsz]
+!250 = metadata !{metadata !"0x101\00lhs\0016780101\000", metadata !246, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [lhs] [line 2885]
+!251 = metadata !{metadata !"0x101\00c\0033557317\000", metadata !246, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [c] [line 2885]
+!252 = metadata !{metadata !"0x101\00r\0050334533\000", metadata !246, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [r] [line 2885]
+!253 = metadata !{metadata !"0x100\00pivot\002890\000", metadata !246, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [pivot] [line 2890]
+!254 = metadata !{metadata !"0x100\00coeff\002890\000", metadata !246, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [coeff] [line 2890]
+!255 = metadata !{metadata !"0x2e\00lhsz\00lhsz\00\001475\001\001\000\006\00256\001\001475", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !256} ; [ DW_TAG_subprogram ] [line 1475] [local] [def] [lhsz]
 !256 = metadata !{metadata !257, metadata !258, metadata !259}
-!257 = metadata !{i32 786688, metadata !255, metadata !"i", metadata !5, i32 1484, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 1484]
-!258 = metadata !{i32 786688, metadata !255, metadata !"j", metadata !5, i32 1484, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 1484]
-!259 = metadata !{i32 786688, metadata !255, metadata !"k", metadata !5, i32 1484, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 1484]
-!260 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"y_solve", metadata !"y_solve", metadata !"", i32 3299, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 3299} ; [ DW_TAG_subprogram ] [line 3299] [local] [def] [y_solve]
-!261 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"y_backsubstitute", metadata !"y_backsubstitute", metadata !"", i32 3323, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !262, i32 3323} ; [ DW_TAG_subprogram ] [line 3323] [local] [def] [y_backsubstitute]
+!257 = metadata !{metadata !"0x100\00i\001484\000", metadata !255, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 1484]
+!258 = metadata !{metadata !"0x100\00j\001484\000", metadata !255, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 1484]
+!259 = metadata !{metadata !"0x100\00k\001484\000", metadata !255, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 1484]
+!260 = metadata !{metadata !"0x2e\00y_solve\00y_solve\00\003299\001\001\000\006\00256\001\003299", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3299] [local] [def] [y_solve]
+!261 = metadata !{metadata !"0x2e\00y_backsubstitute\00y_backsubstitute\00\003323\001\001\000\006\00256\001\003323", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !262} ; [ DW_TAG_subprogram ] [line 3323] [local] [def] [y_backsubstitute]
 !262 = metadata !{metadata !263, metadata !264, metadata !265, metadata !266, metadata !267}
-!263 = metadata !{i32 786688, metadata !261, metadata !"i", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3335]
-!264 = metadata !{i32 786688, metadata !261, metadata !"j", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 3335]
-!265 = metadata !{i32 786688, metadata !261, metadata !"k", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 3335]
-!266 = metadata !{i32 786688, metadata !261, metadata !"m", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 3335]
-!267 = metadata !{i32 786688, metadata !261, metadata !"n", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n] [line 3335]
-!268 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"y_solve_cell", metadata !"y_solve_cell", metadata !"", i32 3355, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !269, i32 3355} ; [ DW_TAG_subprogram ] [line 3355] [local] [def] [y_solve_cell]
+!263 = metadata !{metadata !"0x100\00i\003335\000", metadata !261, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 3335]
+!264 = metadata !{metadata !"0x100\00j\003335\000", metadata !261, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 3335]
+!265 = metadata !{metadata !"0x100\00k\003335\000", metadata !261, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 3335]
+!266 = metadata !{metadata !"0x100\00m\003335\000", metadata !261, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 3335]
+!267 = metadata !{metadata !"0x100\00n\003335\000", metadata !261, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [n] [line 3335]
+!268 = metadata !{metadata !"0x2e\00y_solve_cell\00y_solve_cell\00\003355\001\001\000\006\00256\001\003355", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !269} ; [ DW_TAG_subprogram ] [line 3355] [local] [def] [y_solve_cell]
 !269 = metadata !{metadata !270, metadata !271, metadata !272, metadata !273}
-!270 = metadata !{i32 786688, metadata !268, metadata !"i", metadata !5, i32 3370, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3370]
-!271 = metadata !{i32 786688, metadata !268, metadata !"j", metadata !5, i32 3370, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 3370]
-!272 = metadata !{i32 786688, metadata !268, metadata !"k", metadata !5, i32 3370, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 3370]
-!273 = metadata !{i32 786688, metadata !268, metadata !"jsize", metadata !5, i32 3370, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [jsize] [line 3370]
-!274 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"lhsy", metadata !"lhsy", metadata !"", i32 1181, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !275, i32 1181} ; [ DW_TAG_subprogram ] [line 1181] [local] [def] [lhsy]
+!270 = metadata !{metadata !"0x100\00i\003370\000", metadata !268, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 3370]
+!271 = metadata !{metadata !"0x100\00j\003370\000", metadata !268, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 3370]
+!272 = metadata !{metadata !"0x100\00k\003370\000", metadata !268, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 3370]
+!273 = metadata !{metadata !"0x100\00jsize\003370\000", metadata !268, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [jsize] [line 3370]
+!274 = metadata !{metadata !"0x2e\00lhsy\00lhsy\00\001181\001\001\000\006\00256\001\001181", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !275} ; [ DW_TAG_subprogram ] [line 1181] [local] [def] [lhsy]
 !275 = metadata !{metadata !276, metadata !277, metadata !278}
-!276 = metadata !{i32 786688, metadata !274, metadata !"i", metadata !5, i32 1190, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 1190]
-!277 = metadata !{i32 786688, metadata !274, metadata !"j", metadata !5, i32 1190, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 1190]
-!278 = metadata !{i32 786688, metadata !274, metadata !"k", metadata !5, i32 1190, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 1190]
-!279 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"x_solve", metadata !"x_solve", metadata !"", i32 2658, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 2658} ; [ DW_TAG_subprogram ] [line 2658] [local] [def] [x_solve]
-!280 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"x_backsubstitute", metadata !"x_backsubstitute", metadata !"", i32 2684, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !281, i32 2684} ; [ DW_TAG_subprogram ] [line 2684] [local] [def] [x_backsubstitute]
+!276 = metadata !{metadata !"0x100\00i\001190\000", metadata !274, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 1190]
+!277 = metadata !{metadata !"0x100\00j\001190\000", metadata !274, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 1190]
+!278 = metadata !{metadata !"0x100\00k\001190\000", metadata !274, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 1190]
+!279 = metadata !{metadata !"0x2e\00x_solve\00x_solve\00\002658\001\001\000\006\00256\001\002658", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2658] [local] [def] [x_solve]
+!280 = metadata !{metadata !"0x2e\00x_backsubstitute\00x_backsubstitute\00\002684\001\001\000\006\00256\001\002684", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !281} ; [ DW_TAG_subprogram ] [line 2684] [local] [def] [x_backsubstitute]
 !281 = metadata !{metadata !282, metadata !283, metadata !284, metadata !285, metadata !286}
-!282 = metadata !{i32 786688, metadata !280, metadata !"i", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2696]
-!283 = metadata !{i32 786688, metadata !280, metadata !"j", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 2696]
-!284 = metadata !{i32 786688, metadata !280, metadata !"k", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 2696]
-!285 = metadata !{i32 786688, metadata !280, metadata !"m", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 2696]
-!286 = metadata !{i32 786688, metadata !280, metadata !"n", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n] [line 2696]
-!287 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"x_solve_cell", metadata !"x_solve_cell", metadata !"", i32 2716, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !288, i32 2716} ; [ DW_TAG_subprogram ] [line 2716] [local] [def] [x_solve_cell]
+!282 = metadata !{metadata !"0x100\00i\002696\000", metadata !280, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 2696]
+!283 = metadata !{metadata !"0x100\00j\002696\000", metadata !280, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 2696]
+!284 = metadata !{metadata !"0x100\00k\002696\000", metadata !280, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 2696]
+!285 = metadata !{metadata !"0x100\00m\002696\000", metadata !280, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 2696]
+!286 = metadata !{metadata !"0x100\00n\002696\000", metadata !280, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [n] [line 2696]
+!287 = metadata !{metadata !"0x2e\00x_solve_cell\00x_solve_cell\00\002716\001\001\000\006\00256\001\002716", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !288} ; [ DW_TAG_subprogram ] [line 2716] [local] [def] [x_solve_cell]
 !288 = metadata !{metadata !289, metadata !290, metadata !291, metadata !292}
-!289 = metadata !{i32 786688, metadata !287, metadata !"i", metadata !5, i32 2728, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2728]
-!290 = metadata !{i32 786688, metadata !287, metadata !"j", metadata !5, i32 2728, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 2728]
-!291 = metadata !{i32 786688, metadata !287, metadata !"k", metadata !5, i32 2728, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 2728]
-!292 = metadata !{i32 786688, metadata !287, metadata !"isize", metadata !5, i32 2728, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [isize] [line 2728]
-!293 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"lhsx", metadata !"lhsx", metadata !"", i32 898, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !294, i32 898} ; [ DW_TAG_subprogram ] [line 898] [local] [def] [lhsx]
+!289 = metadata !{metadata !"0x100\00i\002728\000", metadata !287, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 2728]
+!290 = metadata !{metadata !"0x100\00j\002728\000", metadata !287, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 2728]
+!291 = metadata !{metadata !"0x100\00k\002728\000", metadata !287, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 2728]
+!292 = metadata !{metadata !"0x100\00isize\002728\000", metadata !287, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [isize] [line 2728]
+!293 = metadata !{metadata !"0x2e\00lhsx\00lhsx\00\00898\001\001\000\006\00256\001\00898", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !294} ; [ DW_TAG_subprogram ] [line 898] [local] [def] [lhsx]
 !294 = metadata !{metadata !295, metadata !296, metadata !297}
-!295 = metadata !{i32 786688, metadata !293, metadata !"i", metadata !5, i32 907, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 907]
-!296 = metadata !{i32 786688, metadata !293, metadata !"j", metadata !5, i32 907, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 907]
-!297 = metadata !{i32 786688, metadata !293, metadata !"k", metadata !5, i32 907, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 907]
+!295 = metadata !{metadata !"0x100\00i\00907\000", metadata !293, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 907]
+!296 = metadata !{metadata !"0x100\00j\00907\000", metadata !293, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 907]
+!297 = metadata !{metadata !"0x100\00k\00907\000", metadata !293, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 907]
 !298 = metadata !{metadata !299, metadata !304, metadata !305, metadata !309, metadata !310, metadata !311, metadata !312, metadata !313, metadata !314, metadata !315, metadata !316, metadata !317, metadata !318, metadata !319, metadata !320, metadata !321, metadata !322, metadata !323, metadata !324, metadata !325, metadata !326, metadata !327, metadata !328, metadata !329, metadata !330, metadata !331, metadata !332, metadata !333, metadata !334, metadata !335, metadata !336, metadata !337, metadata !338, metadata !339, metadata !340, metadata !341, metadata !342, metadata !343, metadata !347, metadata !350, metadata !351, metadata !352, metadata !353, metadata !354, metadata !355, metadata !356, metadata !360, metadata !361, metadata !362, metadata !363, metadata !364, metadata !365, metadata !366, metadata !367, metadata !368, metadata !369, metadata !370, metadata !371, metadata !372, metadata !373, metadata !374, metadata !375, metadata !376, metadata !377, metadata !378, metadata !379, metadata !380, metadata !381, metadata !382, metadata !383, metadata !384, metadata !385, metadata !386, metadata !387, metadata !388, metadata !389, metadata !390, metadata !391, metadata !392, metadata !393, metadata !394, metadata !395, metadata !396, metadata !397, metadata !398, metadata !399, metadata !400, metadata !401, metadata !402, metadata !403, metadata !404, metadata !405, metadata !406, metadata !407, metadata !408, metadata !409, metadata !410, metadata !411, metadata !412, metadata !413, metadata !414, metadata !415, metadata !416, metadata !417, metadata !418, metadata !419, metadata !422, metadata !426, metadata !427, metadata !430, metadata !431, metadata !434, metadata !435, metadata !436, metadata !437}
-!299 = metadata !{i32 786484, i32 0, null, metadata !"grid_points", metadata !"grid_points", metadata !"", metadata !300, i32 28, metadata !302, i32 1, i32 1, [3 x i32]* @grid_points, null} ; [ DW_TAG_variable ] [grid_points] [line 28] [local] [def]
-!300 = metadata !{i32 786473, metadata !301}      ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/./header.h]
+!299 = metadata !{metadata !"0x34\00grid_points\00grid_points\00\0028\001\001", null, metadata !300, metadata !302, [3 x i32]* @grid_points, null} ; [ DW_TAG_variable ] [grid_points] [line 28] [local] [def]
+!300 = metadata !{metadata !"0x29", metadata !301}      ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/./header.h]
 !301 = metadata !{metadata !"./header.h", metadata !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
-!302 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 96, i64 32, i32 0, i32 0, metadata !8, metadata !303, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 96, align 32, offset 0] [from int]
+!302 = metadata !{metadata !"0x1\00\000\0096\0032\000\000", null, null, metadata !8, metadata !303, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 96, align 32, offset 0] [from int]
 !303 = metadata !{metadata !178}
-!304 = metadata !{i32 786484, i32 0, null, metadata !"dt", metadata !"dt", metadata !"", metadata !300, i32 35, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dt] [line 35] [local] [def]
-!305 = metadata !{i32 786484, i32 0, null, metadata !"rhs", metadata !"rhs", metadata !"", metadata !300, i32 68, metadata !306, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [rhs] [line 68] [local] [def]
-!306 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 1385839040, i64 64, i32 0, i32 0, metadata !20, metadata !307, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1385839040, align 64, offset 0] [from double]
+!304 = metadata !{metadata !"0x34\00dt\00dt\00\0035\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dt] [line 35] [local] [def]
+!305 = metadata !{metadata !"0x34\00rhs\00rhs\00\0068\001\001", null, metadata !300, metadata !306, null, null} ; [ DW_TAG_variable ] [rhs] [line 68] [local] [def]
+!306 = metadata !{metadata !"0x1\00\000\001385839040\0064\000\000", null, null, metadata !20, metadata !307, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1385839040, align 64, offset 0] [from double]
 !307 = metadata !{metadata !308, metadata !308, metadata !308, metadata !93}
-!308 = metadata !{i32 786465, i64 0, i64 163}     ; [ DW_TAG_subrange_type ] [0, 162]
-!309 = metadata !{i32 786484, i32 0, null, metadata !"zzcon5", metadata !"zzcon5", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon5] [line 42] [local] [def]
-!310 = metadata !{i32 786484, i32 0, null, metadata !"zzcon4", metadata !"zzcon4", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon4] [line 42] [local] [def]
-!311 = metadata !{i32 786484, i32 0, null, metadata !"zzcon3", metadata !"zzcon3", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon3] [line 42] [local] [def]
-!312 = metadata !{i32 786484, i32 0, null, metadata !"dz5tz1", metadata !"dz5tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz5tz1] [line 43] [local] [def]
-!313 = metadata !{i32 786484, i32 0, null, metadata !"dz4tz1", metadata !"dz4tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz4tz1] [line 43] [local] [def]
-!314 = metadata !{i32 786484, i32 0, null, metadata !"dz3tz1", metadata !"dz3tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz3tz1] [line 43] [local] [def]
-!315 = metadata !{i32 786484, i32 0, null, metadata !"zzcon2", metadata !"zzcon2", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon2] [line 42] [local] [def]
-!316 = metadata !{i32 786484, i32 0, null, metadata !"dz2tz1", metadata !"dz2tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz2tz1] [line 43] [local] [def]
-!317 = metadata !{i32 786484, i32 0, null, metadata !"tz2", metadata !"tz2", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tz2] [line 31] [local] [def]
-!318 = metadata !{i32 786484, i32 0, null, metadata !"dz1tz1", metadata !"dz1tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz1tz1] [line 43] [local] [def]
-!319 = metadata !{i32 786484, i32 0, null, metadata !"yycon5", metadata !"yycon5", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon5] [line 40] [local] [def]
-!320 = metadata !{i32 786484, i32 0, null, metadata !"yycon4", metadata !"yycon4", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon4] [line 40] [local] [def]
-!321 = metadata !{i32 786484, i32 0, null, metadata !"yycon3", metadata !"yycon3", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon3] [line 40] [local] [def]
-!322 = metadata !{i32 786484, i32 0, null, metadata !"dy5ty1", metadata !"dy5ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy5ty1] [line 41] [local] [def]
-!323 = metadata !{i32 786484, i32 0, null, metadata !"dy4ty1", metadata !"dy4ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy4ty1] [line 41] [local] [def]
-!324 = metadata !{i32 786484, i32 0, null, metadata !"dy3ty1", metadata !"dy3ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy3ty1] [line 41] [local] [def]
-!325 = metadata !{i32 786484, i32 0, null, metadata !"yycon2", metadata !"yycon2", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon2] [line 40] [local] [def]
-!326 = metadata !{i32 786484, i32 0, null, metadata !"dy2ty1", metadata !"dy2ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy2ty1] [line 41] [local] [def]
-!327 = metadata !{i32 786484, i32 0, null, metadata !"ty2", metadata !"ty2", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ty2] [line 31] [local] [def]
-!328 = metadata !{i32 786484, i32 0, null, metadata !"dy1ty1", metadata !"dy1ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy1ty1] [line 41] [local] [def]
-!329 = metadata !{i32 786484, i32 0, null, metadata !"dssp", metadata !"dssp", metadata !"", metadata !300, i32 35, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dssp] [line 35] [local] [def]
-!330 = metadata !{i32 786484, i32 0, null, metadata !"c1", metadata !"c1", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c1] [line 45] [local] [def]
-!331 = metadata !{i32 786484, i32 0, null, metadata !"xxcon5", metadata !"xxcon5", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon5] [line 38] [local] [def]
-!332 = metadata !{i32 786484, i32 0, null, metadata !"xxcon4", metadata !"xxcon4", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon4] [line 38] [local] [def]
-!333 = metadata !{i32 786484, i32 0, null, metadata !"xxcon3", metadata !"xxcon3", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon3] [line 38] [local] [def]
-!334 = metadata !{i32 786484, i32 0, null, metadata !"dx5tx1", metadata !"dx5tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx5tx1] [line 39] [local] [def]
-!335 = metadata !{i32 786484, i32 0, null, metadata !"dx4tx1", metadata !"dx4tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx4tx1] [line 39] [local] [def]
-!336 = metadata !{i32 786484, i32 0, null, metadata !"dx3tx1", metadata !"dx3tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx3tx1] [line 39] [local] [def]
-!337 = metadata !{i32 786484, i32 0, null, metadata !"c2", metadata !"c2", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2] [line 45] [local] [def]
-!338 = metadata !{i32 786484, i32 0, null, metadata !"con43", metadata !"con43", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [con43] [line 48] [local] [def]
-!339 = metadata !{i32 786484, i32 0, null, metadata !"xxcon2", metadata !"xxcon2", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon2] [line 38] [local] [def]
-!340 = metadata !{i32 786484, i32 0, null, metadata !"dx2tx1", metadata !"dx2tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx2tx1] [line 39] [local] [def]
-!341 = metadata !{i32 786484, i32 0, null, metadata !"tx2", metadata !"tx2", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tx2] [line 31] [local] [def]
-!342 = metadata !{i32 786484, i32 0, null, metadata !"dx1tx1", metadata !"dx1tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx1tx1] [line 39] [local] [def]
-!343 = metadata !{i32 786484, i32 0, null, metadata !"forcing", metadata !"forcing", metadata !"", metadata !300, i32 66, metadata !344, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [forcing] [line 66] [local] [def]
-!344 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 1663006848, i64 64, i32 0, i32 0, metadata !20, metadata !345, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1663006848, align 64, offset 0] [from double]
+!308 = metadata !{metadata !"0x21\000\00163"}     ; [ DW_TAG_subrange_type ] [0, 162]
+!309 = metadata !{metadata !"0x34\00zzcon5\00zzcon5\00\0042\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [zzcon5] [line 42] [local] [def]
+!310 = metadata !{metadata !"0x34\00zzcon4\00zzcon4\00\0042\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [zzcon4] [line 42] [local] [def]
+!311 = metadata !{metadata !"0x34\00zzcon3\00zzcon3\00\0042\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [zzcon3] [line 42] [local] [def]
+!312 = metadata !{metadata !"0x34\00dz5tz1\00dz5tz1\00\0043\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz5tz1] [line 43] [local] [def]
+!313 = metadata !{metadata !"0x34\00dz4tz1\00dz4tz1\00\0043\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz4tz1] [line 43] [local] [def]
+!314 = metadata !{metadata !"0x34\00dz3tz1\00dz3tz1\00\0043\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz3tz1] [line 43] [local] [def]
+!315 = metadata !{metadata !"0x34\00zzcon2\00zzcon2\00\0042\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [zzcon2] [line 42] [local] [def]
+!316 = metadata !{metadata !"0x34\00dz2tz1\00dz2tz1\00\0043\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz2tz1] [line 43] [local] [def]
+!317 = metadata !{metadata !"0x34\00tz2\00tz2\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tz2] [line 31] [local] [def]
+!318 = metadata !{metadata !"0x34\00dz1tz1\00dz1tz1\00\0043\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz1tz1] [line 43] [local] [def]
+!319 = metadata !{metadata !"0x34\00yycon5\00yycon5\00\0040\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [yycon5] [line 40] [local] [def]
+!320 = metadata !{metadata !"0x34\00yycon4\00yycon4\00\0040\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [yycon4] [line 40] [local] [def]
+!321 = metadata !{metadata !"0x34\00yycon3\00yycon3\00\0040\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [yycon3] [line 40] [local] [def]
+!322 = metadata !{metadata !"0x34\00dy5ty1\00dy5ty1\00\0041\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy5ty1] [line 41] [local] [def]
+!323 = metadata !{metadata !"0x34\00dy4ty1\00dy4ty1\00\0041\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy4ty1] [line 41] [local] [def]
+!324 = metadata !{metadata !"0x34\00dy3ty1\00dy3ty1\00\0041\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy3ty1] [line 41] [local] [def]
+!325 = metadata !{metadata !"0x34\00yycon2\00yycon2\00\0040\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [yycon2] [line 40] [local] [def]
+!326 = metadata !{metadata !"0x34\00dy2ty1\00dy2ty1\00\0041\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy2ty1] [line 41] [local] [def]
+!327 = metadata !{metadata !"0x34\00ty2\00ty2\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [ty2] [line 31] [local] [def]
+!328 = metadata !{metadata !"0x34\00dy1ty1\00dy1ty1\00\0041\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy1ty1] [line 41] [local] [def]
+!329 = metadata !{metadata !"0x34\00dssp\00dssp\00\0035\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dssp] [line 35] [local] [def]
+!330 = metadata !{metadata !"0x34\00c1\00c1\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c1] [line 45] [local] [def]
+!331 = metadata !{metadata !"0x34\00xxcon5\00xxcon5\00\0038\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [xxcon5] [line 38] [local] [def]
+!332 = metadata !{metadata !"0x34\00xxcon4\00xxcon4\00\0038\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [xxcon4] [line 38] [local] [def]
+!333 = metadata !{metadata !"0x34\00xxcon3\00xxcon3\00\0038\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [xxcon3] [line 38] [local] [def]
+!334 = metadata !{metadata !"0x34\00dx5tx1\00dx5tx1\00\0039\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx5tx1] [line 39] [local] [def]
+!335 = metadata !{metadata !"0x34\00dx4tx1\00dx4tx1\00\0039\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx4tx1] [line 39] [local] [def]
+!336 = metadata !{metadata !"0x34\00dx3tx1\00dx3tx1\00\0039\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx3tx1] [line 39] [local] [def]
+!337 = metadata !{metadata !"0x34\00c2\00c2\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c2] [line 45] [local] [def]
+!338 = metadata !{metadata !"0x34\00con43\00con43\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [con43] [line 48] [local] [def]
+!339 = metadata !{metadata !"0x34\00xxcon2\00xxcon2\00\0038\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [xxcon2] [line 38] [local] [def]
+!340 = metadata !{metadata !"0x34\00dx2tx1\00dx2tx1\00\0039\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx2tx1] [line 39] [local] [def]
+!341 = metadata !{metadata !"0x34\00tx2\00tx2\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tx2] [line 31] [local] [def]
+!342 = metadata !{metadata !"0x34\00dx1tx1\00dx1tx1\00\0039\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx1tx1] [line 39] [local] [def]
+!343 = metadata !{metadata !"0x34\00forcing\00forcing\00\0066\001\001", null, metadata !300, metadata !344, null, null} ; [ DW_TAG_variable ] [forcing] [line 66] [local] [def]
+!344 = metadata !{metadata !"0x1\00\000\001663006848\0064\000\000", null, null, metadata !20, metadata !345, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1663006848, align 64, offset 0] [from double]
 !345 = metadata !{metadata !308, metadata !308, metadata !308, metadata !346}
-!346 = metadata !{i32 786465, i64 0, i64 6}       ; [ DW_TAG_subrange_type ] [0, 5]
-!347 = metadata !{i32 786484, i32 0, null, metadata !"qs", metadata !"qs", metadata !"", metadata !300, i32 63, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [qs] [line 63] [local] [def]
-!348 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 277167808, i64 64, i32 0, i32 0, metadata !20, metadata !349, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 277167808, align 64, offset 0] [from double]
+!346 = metadata !{metadata !"0x21\000\006"}       ; [ DW_TAG_subrange_type ] [0, 5]
+!347 = metadata !{metadata !"0x34\00qs\00qs\00\0063\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [qs] [line 63] [local] [def]
+!348 = metadata !{metadata !"0x1\00\000\00277167808\0064\000\000", null, null, metadata !20, metadata !349, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 277167808, align 64, offset 0] [from double]
 !349 = metadata !{metadata !308, metadata !308, metadata !308}
-!350 = metadata !{i32 786484, i32 0, null, metadata !"square", metadata !"square", metadata !"", metadata !300, i32 65, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [square] [line 65] [local] [def]
-!351 = metadata !{i32 786484, i32 0, null, metadata !"ws", metadata !"ws", metadata !"", metadata !300, i32 62, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ws] [line 62] [local] [def]
-!352 = metadata !{i32 786484, i32 0, null, metadata !"vs", metadata !"vs", metadata !"", metadata !300, i32 61, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [vs] [line 61] [local] [def]
-!353 = metadata !{i32 786484, i32 0, null, metadata !"us", metadata !"us", metadata !"", metadata !300, i32 60, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [us] [line 60] [local] [def]
-!354 = metadata !{i32 786484, i32 0, null, metadata !"rho_i", metadata !"rho_i", metadata !"", metadata !300, i32 64, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [rho_i] [line 64] [local] [def]
-!355 = metadata !{i32 786484, i32 0, null, metadata !"u", metadata !"u", metadata !"", metadata !300, i32 67, metadata !306, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [u] [line 67] [local] [def]
-!356 = metadata !{i32 786484, i32 0, null, metadata !"ce", metadata !"ce", metadata !"", metadata !300, i32 36, metadata !357, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ce] [line 36] [local] [def]
-!357 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 4160, i64 64, i32 0, i32 0, metadata !20, metadata !358, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 4160, align 64, offset 0] [from double]
+!350 = metadata !{metadata !"0x34\00square\00square\00\0065\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [square] [line 65] [local] [def]
+!351 = metadata !{metadata !"0x34\00ws\00ws\00\0062\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [ws] [line 62] [local] [def]
+!352 = metadata !{metadata !"0x34\00vs\00vs\00\0061\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [vs] [line 61] [local] [def]
+!353 = metadata !{metadata !"0x34\00us\00us\00\0060\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [us] [line 60] [local] [def]
+!354 = metadata !{metadata !"0x34\00rho_i\00rho_i\00\0064\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [rho_i] [line 64] [local] [def]
+!355 = metadata !{metadata !"0x34\00u\00u\00\0067\001\001", null, metadata !300, metadata !306, null, null} ; [ DW_TAG_variable ] [u] [line 67] [local] [def]
+!356 = metadata !{metadata !"0x34\00ce\00ce\00\0036\001\001", null, metadata !300, metadata !357, null, null} ; [ DW_TAG_variable ] [ce] [line 36] [local] [def]
+!357 = metadata !{metadata !"0x1\00\000\004160\0064\000\000", null, null, metadata !20, metadata !358, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 4160, align 64, offset 0] [from double]
 !358 = metadata !{metadata !93, metadata !359}
-!359 = metadata !{i32 786465, i64 0, i64 13}      ; [ DW_TAG_subrange_type ] [0, 12]
-!360 = metadata !{i32 786484, i32 0, null, metadata !"dnzm1", metadata !"dnzm1", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dnzm1] [line 44] [local] [def]
-!361 = metadata !{i32 786484, i32 0, null, metadata !"dnym1", metadata !"dnym1", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dnym1] [line 44] [local] [def]
-!362 = metadata !{i32 786484, i32 0, null, metadata !"dnxm1", metadata !"dnxm1", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dnxm1] [line 44] [local] [def]
-!363 = metadata !{i32 786484, i32 0, null, metadata !"zzcon1", metadata !"zzcon1", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon1] [line 42] [local] [def]
-!364 = metadata !{i32 786484, i32 0, null, metadata !"yycon1", metadata !"yycon1", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon1] [line 40] [local] [def]
-!365 = metadata !{i32 786484, i32 0, null, metadata !"xxcon1", metadata !"xxcon1", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon1] [line 38] [local] [def]
-!366 = metadata !{i32 786484, i32 0, null, metadata !"con16", metadata !"con16", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [con16] [line 48] [local] [def]
-!367 = metadata !{i32 786484, i32 0, null, metadata !"c2iv", metadata !"c2iv", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2iv] [line 48] [local] [def]
-!368 = metadata !{i32 786484, i32 0, null, metadata !"c3c4tz3", metadata !"c3c4tz3", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3c4tz3] [line 48] [local] [def]
-!369 = metadata !{i32 786484, i32 0, null, metadata !"c3c4ty3", metadata !"c3c4ty3", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3c4ty3] [line 48] [local] [def]
-!370 = metadata !{i32 786484, i32 0, null, metadata !"c3c4tx3", metadata !"c3c4tx3", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3c4tx3] [line 48] [local] [def]
-!371 = metadata !{i32 786484, i32 0, null, metadata !"comz6", metadata !"comz6", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [comz6] [line 47] [local] [def]
-!372 = metadata !{i32 786484, i32 0, null, metadata !"comz5", metadata !"comz5", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [comz5] [line 47] [local] [def]
-!373 = metadata !{i32 786484, i32 0, null, metadata !"comz4", metadata !"comz4", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [comz4] [line 47] [local] [def]
-!374 = metadata !{i32 786484, i32 0, null, metadata !"comz1", metadata !"comz1", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [comz1] [line 47] [local] [def]
-!375 = metadata !{i32 786484, i32 0, null, metadata !"dtdssp", metadata !"dtdssp", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dtdssp] [line 45] [local] [def]
-!376 = metadata !{i32 786484, i32 0, null, metadata !"c2dttz1", metadata !"c2dttz1", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2dttz1] [line 47] [local] [def]
-!377 = metadata !{i32 786484, i32 0, null, metadata !"c2dtty1", metadata !"c2dtty1", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2dtty1] [line 47] [local] [def]
-!378 = metadata !{i32 786484, i32 0, null, metadata !"c2dttx1", metadata !"c2dttx1", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2dttx1] [line 47] [local] [def]
-!379 = metadata !{i32 786484, i32 0, null, metadata !"dttz2", metadata !"dttz2", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dttz2] [line 46] [local] [def]
-!380 = metadata !{i32 786484, i32 0, null, metadata !"dttz1", metadata !"dttz1", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dttz1] [line 46] [local] [def]
-!381 = metadata !{i32 786484, i32 0, null, metadata !"dtty2", metadata !"dtty2", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dtty2] [line 46] [local] [def]
-!382 = metadata !{i32 786484, i32 0, null, metadata !"dtty1", metadata !"dtty1", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dtty1] [line 46] [local] [def]
-!383 = metadata !{i32 786484, i32 0, null, metadata !"dttx2", metadata !"dttx2", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dttx2] [line 46] [local] [def]
-!384 = metadata !{i32 786484, i32 0, null, metadata !"dttx1", metadata !"dttx1", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dttx1] [line 46] [local] [def]
-!385 = metadata !{i32 786484, i32 0, null, metadata !"c5dssp", metadata !"c5dssp", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c5dssp] [line 45] [local] [def]
-!386 = metadata !{i32 786484, i32 0, null, metadata !"c4dssp", metadata !"c4dssp", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c4dssp] [line 45] [local] [def]
-!387 = metadata !{i32 786484, i32 0, null, metadata !"dzmax", metadata !"dzmax", metadata !"", metadata !300, i32 37, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dzmax] [line 37] [local] [def]
-!388 = metadata !{i32 786484, i32 0, null, metadata !"dymax", metadata !"dymax", metadata !"", metadata !300, i32 37, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dymax] [line 37] [local] [def]
-!389 = metadata !{i32 786484, i32 0, null, metadata !"dxmax", metadata !"dxmax", metadata !"", metadata !300, i32 37, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dxmax] [line 37] [local] [def]
-!390 = metadata !{i32 786484, i32 0, null, metadata !"dz5", metadata !"dz5", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz5] [line 34] [local] [def]
-!391 = metadata !{i32 786484, i32 0, null, metadata !"dz4", metadata !"dz4", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz4] [line 34] [local] [def]
-!392 = metadata !{i32 786484, i32 0, null, metadata !"dz3", metadata !"dz3", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz3] [line 34] [local] [def]
-!393 = metadata !{i32 786484, i32 0, null, metadata !"dz2", metadata !"dz2", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz2] [line 34] [local] [def]
-!394 = metadata !{i32 786484, i32 0, null, metadata !"dz1", metadata !"dz1", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz1] [line 34] [local] [def]
-!395 = metadata !{i32 786484, i32 0, null, metadata !"dy5", metadata !"dy5", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy5] [line 33] [local] [def]
-!396 = metadata !{i32 786484, i32 0, null, metadata !"dy4", metadata !"dy4", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy4] [line 33] [local] [def]
-!397 = metadata !{i32 786484, i32 0, null, metadata !"dy3", metadata !"dy3", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy3] [line 33] [local] [def]
-!398 = metadata !{i32 786484, i32 0, null, metadata !"dy2", metadata !"dy2", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy2] [line 33] [local] [def]
-!399 = metadata !{i32 786484, i32 0, null, metadata !"dy1", metadata !"dy1", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy1] [line 33] [local] [def]
-!400 = metadata !{i32 786484, i32 0, null, metadata !"dx5", metadata !"dx5", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx5] [line 32] [local] [def]
-!401 = metadata !{i32 786484, i32 0, null, metadata !"dx4", metadata !"dx4", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx4] [line 32] [local] [def]
-!402 = metadata !{i32 786484, i32 0, null, metadata !"dx3", metadata !"dx3", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx3] [line 32] [local] [def]
-!403 = metadata !{i32 786484, i32 0, null, metadata !"dx2", metadata !"dx2", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx2] [line 32] [local] [def]
-!404 = metadata !{i32 786484, i32 0, null, metadata !"dx1", metadata !"dx1", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx1] [line 32] [local] [def]
-!405 = metadata !{i32 786484, i32 0, null, metadata !"tz3", metadata !"tz3", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tz3] [line 31] [local] [def]
-!406 = metadata !{i32 786484, i32 0, null, metadata !"tz1", metadata !"tz1", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tz1] [line 31] [local] [def]
-!407 = metadata !{i32 786484, i32 0, null, metadata !"ty3", metadata !"ty3", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ty3] [line 31] [local] [def]
-!408 = metadata !{i32 786484, i32 0, null, metadata !"ty1", metadata !"ty1", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ty1] [line 31] [local] [def]
-!409 = metadata !{i32 786484, i32 0, null, metadata !"tx3", metadata !"tx3", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tx3] [line 31] [local] [def]
-!410 = metadata !{i32 786484, i32 0, null, metadata !"tx1", metadata !"tx1", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tx1] [line 31] [local] [def]
-!411 = metadata !{i32 786484, i32 0, null, metadata !"conz1", metadata !"conz1", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [conz1] [line 45] [local] [def]
-!412 = metadata !{i32 786484, i32 0, null, metadata !"c1345", metadata !"c1345", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c1345] [line 44] [local] [def]
-!413 = metadata !{i32 786484, i32 0, null, metadata !"c3c4", metadata !"c3c4", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3c4] [line 44] [local] [def]
-!414 = metadata !{i32 786484, i32 0, null, metadata !"c1c5", metadata !"c1c5", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c1c5] [line 44] [local] [def]
-!415 = metadata !{i32 786484, i32 0, null, metadata !"c1c2", metadata !"c1c2", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c1c2] [line 44] [local] [def]
-!416 = metadata !{i32 786484, i32 0, null, metadata !"c5", metadata !"c5", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c5] [line 45] [local] [def]
-!417 = metadata !{i32 786484, i32 0, null, metadata !"c4", metadata !"c4", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c4] [line 45] [local] [def]
-!418 = metadata !{i32 786484, i32 0, null, metadata !"c3", metadata !"c3", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3] [line 45] [local] [def]
-!419 = metadata !{i32 786484, i32 0, null, metadata !"lhs", metadata !"lhs", metadata !"", metadata !300, i32 69, metadata !420, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [lhs] [line 69] [local] [def]
-!420 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 20787585600, i64 64, i32 0, i32 0, metadata !20, metadata !421, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 20787585600, align 64, offset 0] [from double]
+!359 = metadata !{metadata !"0x21\000\0013"}      ; [ DW_TAG_subrange_type ] [0, 12]
+!360 = metadata !{metadata !"0x34\00dnzm1\00dnzm1\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dnzm1] [line 44] [local] [def]
+!361 = metadata !{metadata !"0x34\00dnym1\00dnym1\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dnym1] [line 44] [local] [def]
+!362 = metadata !{metadata !"0x34\00dnxm1\00dnxm1\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dnxm1] [line 44] [local] [def]
+!363 = metadata !{metadata !"0x34\00zzcon1\00zzcon1\00\0042\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [zzcon1] [line 42] [local] [def]
+!364 = metadata !{metadata !"0x34\00yycon1\00yycon1\00\0040\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [yycon1] [line 40] [local] [def]
+!365 = metadata !{metadata !"0x34\00xxcon1\00xxcon1\00\0038\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [xxcon1] [line 38] [local] [def]
+!366 = metadata !{metadata !"0x34\00con16\00con16\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [con16] [line 48] [local] [def]
+!367 = metadata !{metadata !"0x34\00c2iv\00c2iv\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c2iv] [line 48] [local] [def]
+!368 = metadata !{metadata !"0x34\00c3c4tz3\00c3c4tz3\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c3c4tz3] [line 48] [local] [def]
+!369 = metadata !{metadata !"0x34\00c3c4ty3\00c3c4ty3\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c3c4ty3] [line 48] [local] [def]
+!370 = metadata !{metadata !"0x34\00c3c4tx3\00c3c4tx3\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c3c4tx3] [line 48] [local] [def]
+!371 = metadata !{metadata !"0x34\00comz6\00comz6\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [comz6] [line 47] [local] [def]
+!372 = metadata !{metadata !"0x34\00comz5\00comz5\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [comz5] [line 47] [local] [def]
+!373 = metadata !{metadata !"0x34\00comz4\00comz4\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [comz4] [line 47] [local] [def]
+!374 = metadata !{metadata !"0x34\00comz1\00comz1\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [comz1] [line 47] [local] [def]
+!375 = metadata !{metadata !"0x34\00dtdssp\00dtdssp\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dtdssp] [line 45] [local] [def]
+!376 = metadata !{metadata !"0x34\00c2dttz1\00c2dttz1\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c2dttz1] [line 47] [local] [def]
+!377 = metadata !{metadata !"0x34\00c2dtty1\00c2dtty1\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c2dtty1] [line 47] [local] [def]
+!378 = metadata !{metadata !"0x34\00c2dttx1\00c2dttx1\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c2dttx1] [line 47] [local] [def]
+!379 = metadata !{metadata !"0x34\00dttz2\00dttz2\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dttz2] [line 46] [local] [def]
+!380 = metadata !{metadata !"0x34\00dttz1\00dttz1\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dttz1] [line 46] [local] [def]
+!381 = metadata !{metadata !"0x34\00dtty2\00dtty2\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dtty2] [line 46] [local] [def]
+!382 = metadata !{metadata !"0x34\00dtty1\00dtty1\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dtty1] [line 46] [local] [def]
+!383 = metadata !{metadata !"0x34\00dttx2\00dttx2\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dttx2] [line 46] [local] [def]
+!384 = metadata !{metadata !"0x34\00dttx1\00dttx1\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dttx1] [line 46] [local] [def]
+!385 = metadata !{metadata !"0x34\00c5dssp\00c5dssp\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c5dssp] [line 45] [local] [def]
+!386 = metadata !{metadata !"0x34\00c4dssp\00c4dssp\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c4dssp] [line 45] [local] [def]
+!387 = metadata !{metadata !"0x34\00dzmax\00dzmax\00\0037\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dzmax] [line 37] [local] [def]
+!388 = metadata !{metadata !"0x34\00dymax\00dymax\00\0037\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dymax] [line 37] [local] [def]
+!389 = metadata !{metadata !"0x34\00dxmax\00dxmax\00\0037\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dxmax] [line 37] [local] [def]
+!390 = metadata !{metadata !"0x34\00dz5\00dz5\00\0034\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz5] [line 34] [local] [def]
+!391 = metadata !{metadata !"0x34\00dz4\00dz4\00\0034\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz4] [line 34] [local] [def]
+!392 = metadata !{metadata !"0x34\00dz3\00dz3\00\0034\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz3] [line 34] [local] [def]
+!393 = metadata !{metadata !"0x34\00dz2\00dz2\00\0034\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz2] [line 34] [local] [def]
+!394 = metadata !{metadata !"0x34\00dz1\00dz1\00\0034\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz1] [line 34] [local] [def]
+!395 = metadata !{metadata !"0x34\00dy5\00dy5\00\0033\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy5] [line 33] [local] [def]
+!396 = metadata !{metadata !"0x34\00dy4\00dy4\00\0033\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy4] [line 33] [local] [def]
+!397 = metadata !{metadata !"0x34\00dy3\00dy3\00\0033\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy3] [line 33] [local] [def]
+!398 = metadata !{metadata !"0x34\00dy2\00dy2\00\0033\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy2] [line 33] [local] [def]
+!399 = metadata !{metadata !"0x34\00dy1\00dy1\00\0033\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy1] [line 33] [local] [def]
+!400 = metadata !{metadata !"0x34\00dx5\00dx5\00\0032\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx5] [line 32] [local] [def]
+!401 = metadata !{metadata !"0x34\00dx4\00dx4\00\0032\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx4] [line 32] [local] [def]
+!402 = metadata !{metadata !"0x34\00dx3\00dx3\00\0032\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx3] [line 32] [local] [def]
+!403 = metadata !{metadata !"0x34\00dx2\00dx2\00\0032\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx2] [line 32] [local] [def]
+!404 = metadata !{metadata !"0x34\00dx1\00dx1\00\0032\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx1] [line 32] [local] [def]
+!405 = metadata !{metadata !"0x34\00tz3\00tz3\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tz3] [line 31] [local] [def]
+!406 = metadata !{metadata !"0x34\00tz1\00tz1\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tz1] [line 31] [local] [def]
+!407 = metadata !{metadata !"0x34\00ty3\00ty3\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [ty3] [line 31] [local] [def]
+!408 = metadata !{metadata !"0x34\00ty1\00ty1\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [ty1] [line 31] [local] [def]
+!409 = metadata !{metadata !"0x34\00tx3\00tx3\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tx3] [line 31] [local] [def]
+!410 = metadata !{metadata !"0x34\00tx1\00tx1\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tx1] [line 31] [local] [def]
+!411 = metadata !{metadata !"0x34\00conz1\00conz1\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [conz1] [line 45] [local] [def]
+!412 = metadata !{metadata !"0x34\00c1345\00c1345\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c1345] [line 44] [local] [def]
+!413 = metadata !{metadata !"0x34\00c3c4\00c3c4\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c3c4] [line 44] [local] [def]
+!414 = metadata !{metadata !"0x34\00c1c5\00c1c5\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c1c5] [line 44] [local] [def]
+!415 = metadata !{metadata !"0x34\00c1c2\00c1c2\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c1c2] [line 44] [local] [def]
+!416 = metadata !{metadata !"0x34\00c5\00c5\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c5] [line 45] [local] [def]
+!417 = metadata !{metadata !"0x34\00c4\00c4\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c4] [line 45] [local] [def]
+!418 = metadata !{metadata !"0x34\00c3\00c3\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c3] [line 45] [local] [def]
+!419 = metadata !{metadata !"0x34\00lhs\00lhs\00\0069\001\001", null, metadata !300, metadata !420, null, null} ; [ DW_TAG_variable ] [lhs] [line 69] [local] [def]
+!420 = metadata !{metadata !"0x1\00\000\0020787585600\0064\000\000", null, null, metadata !20, metadata !421, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 20787585600, align 64, offset 0] [from double]
 !421 = metadata !{metadata !308, metadata !308, metadata !308, metadata !178, metadata !93, metadata !93}
-!422 = metadata !{i32 786484, i32 0, null, metadata !"q", metadata !"q", metadata !"", metadata !300, i32 73, metadata !423, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [q] [line 73] [local] [def]
-!423 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 10368, i64 64, i32 0, i32 0, metadata !20, metadata !424, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 10368, align 64, offset 0] [from double]
+!422 = metadata !{metadata !"0x34\00q\00q\00\0073\001\001", null, metadata !300, metadata !423, null, null} ; [ DW_TAG_variable ] [q] [line 73] [local] [def]
+!423 = metadata !{metadata !"0x1\00\000\0010368\0064\000\000", null, null, metadata !20, metadata !424, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 10368, align 64, offset 0] [from double]
 !424 = metadata !{metadata !425}
-!425 = metadata !{i32 786465, i64 0, i64 162}     ; [ DW_TAG_subrange_type ] [0, 161]
-!426 = metadata !{i32 786484, i32 0, null, metadata !"cuf", metadata !"cuf", metadata !"", metadata !300, i32 72, metadata !423, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [cuf] [line 72] [local] [def]
-!427 = metadata !{i32 786484, i32 0, null, metadata !"buf", metadata !"buf", metadata !"", metadata !300, i32 75, metadata !428, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [buf] [line 75] [local] [def]
-!428 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 51840, i64 64, i32 0, i32 0, metadata !20, metadata !429, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 51840, align 64, offset 0] [from double]
+!425 = metadata !{metadata !"0x21\000\00162"}     ; [ DW_TAG_subrange_type ] [0, 161]
+!426 = metadata !{metadata !"0x34\00cuf\00cuf\00\0072\001\001", null, metadata !300, metadata !423, null, null} ; [ DW_TAG_variable ] [cuf] [line 72] [local] [def]
+!427 = metadata !{metadata !"0x34\00buf\00buf\00\0075\001\001", null, metadata !300, metadata !428, null, null} ; [ DW_TAG_variable ] [buf] [line 75] [local] [def]
+!428 = metadata !{metadata !"0x1\00\000\0051840\0064\000\000", null, null, metadata !20, metadata !429, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 51840, align 64, offset 0] [from double]
 !429 = metadata !{metadata !425, metadata !93}
-!430 = metadata !{i32 786484, i32 0, null, metadata !"ue", metadata !"ue", metadata !"", metadata !300, i32 74, metadata !428, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ue] [line 74] [local] [def]
-!431 = metadata !{i32 786484, i32 0, null, metadata !"njac", metadata !"njac", metadata !"", metadata !300, i32 86, metadata !432, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [njac] [line 86] [local] [def]
-!432 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 6886684800, i64 64, i32 0, i32 0, metadata !20, metadata !433, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 6886684800, align 64, offset 0] [from double]
+!430 = metadata !{metadata !"0x34\00ue\00ue\00\0074\001\001", null, metadata !300, metadata !428, null, null} ; [ DW_TAG_variable ] [ue] [line 74] [local] [def]
+!431 = metadata !{metadata !"0x34\00njac\00njac\00\0086\001\001", null, metadata !300, metadata !432, null, null} ; [ DW_TAG_variable ] [njac] [line 86] [local] [def]
+!432 = metadata !{metadata !"0x1\00\000\006886684800\0064\000\000", null, null, metadata !20, metadata !433, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 6886684800, align 64, offset 0] [from double]
 !433 = metadata !{metadata !308, metadata !308, metadata !425, metadata !93, metadata !93}
-!434 = metadata !{i32 786484, i32 0, null, metadata !"fjac", metadata !"fjac", metadata !"", metadata !300, i32 84, metadata !432, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [fjac] [line 84] [local] [def]
-!435 = metadata !{i32 786484, i32 0, null, metadata !"tmp3", metadata !"tmp3", metadata !"", metadata !300, i32 88, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tmp3] [line 88] [local] [def]
-!436 = metadata !{i32 786484, i32 0, null, metadata !"tmp2", metadata !"tmp2", metadata !"", metadata !300, i32 88, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tmp2] [line 88] [local] [def]
-!437 = metadata !{i32 786484, i32 0, null, metadata !"tmp1", metadata !"tmp1", metadata !"", metadata !300, i32 88, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tmp1] [line 88] [local] [def]
+!434 = metadata !{metadata !"0x34\00fjac\00fjac\00\0084\001\001", null, metadata !300, metadata !432, null, null} ; [ DW_TAG_variable ] [fjac] [line 84] [local] [def]
+!435 = metadata !{metadata !"0x34\00tmp3\00tmp3\00\0088\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tmp3] [line 88] [local] [def]
+!436 = metadata !{metadata !"0x34\00tmp2\00tmp2\00\0088\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tmp2] [line 88] [local] [def]
+!437 = metadata !{metadata !"0x34\00tmp1\00tmp1\00\0088\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tmp1] [line 88] [local] [def]
 !438 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
 !439 = metadata !{i32 1898, i32 0, metadata !440, null}
-!440 = metadata !{i32 786443, metadata !1, metadata !114, i32 1898, i32 0, i32 107} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!440 = metadata !{metadata !"0xb\001898\000\00107", metadata !1, metadata !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
 !441 = metadata !{i32 1913, i32 0, metadata !442, null}
-!442 = metadata !{i32 786443, metadata !1, metadata !114, i32 1913, i32 0, i32 115} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!442 = metadata !{metadata !"0xb\001913\000\00115", metadata !1, metadata !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
 !443 = metadata !{i32 1923, i32 0, metadata !114, null}
 !444 = metadata !{metadata !"int", metadata !445}
 !445 = metadata !{metadata !"omnipotent char", metadata !446}
 !446 = metadata !{metadata !"Simple C/C++ TBAA"}
 !447 = metadata !{i32 1}
 !448 = metadata !{i32 1925, i32 0, metadata !449, null}
-!449 = metadata !{i32 786443, metadata !1, metadata !114, i32 1925, i32 0, i32 121} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!449 = metadata !{metadata !"0xb\001925\000\00121", metadata !1, metadata !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
 !450 = metadata !{i32 1939, i32 0, metadata !451, null}
-!451 = metadata !{i32 786443, metadata !1, metadata !114, i32 1939, i32 0, i32 127} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!451 = metadata !{metadata !"0xb\001939\000\00127", metadata !1, metadata !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
 !452 = metadata !{i32 1940, i32 0, metadata !453, null}
-!453 = metadata !{i32 786443, metadata !1, metadata !454, i32 1940, i32 0, i32 129} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!454 = metadata !{i32 786443, metadata !1, metadata !451, i32 1939, i32 0, i32 128} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!453 = metadata !{metadata !"0xb\001940\000\00129", metadata !1, metadata !454} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!454 = metadata !{metadata !"0xb\001939\000\00128", metadata !1, metadata !451} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
 !455 = metadata !{i32 1941, i32 0, metadata !456, null}
-!456 = metadata !{i32 786443, metadata !1, metadata !457, i32 1941, i32 0, i32 131} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!457 = metadata !{i32 786443, metadata !1, metadata !453, i32 1940, i32 0, i32 130} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!456 = metadata !{metadata !"0xb\001941\000\00131", metadata !1, metadata !457} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!457 = metadata !{metadata !"0xb\001940\000\00130", metadata !1, metadata !453} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
 !458 = metadata !{i32 2020, i32 0, metadata !459, null}
-!459 = metadata !{i32 786443, metadata !1, metadata !460, i32 2020, i32 0, i32 149} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!460 = metadata !{i32 786443, metadata !1, metadata !461, i32 2019, i32 0, i32 148} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!461 = metadata !{i32 786443, metadata !1, metadata !462, i32 2019, i32 0, i32 147} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!462 = metadata !{i32 786443, metadata !1, metadata !463, i32 2018, i32 0, i32 146} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!463 = metadata !{i32 786443, metadata !1, metadata !114, i32 2018, i32 0, i32 145} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!464 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!459 = metadata !{metadata !"0xb\002020\000\00149", metadata !1, metadata !460} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!460 = metadata !{metadata !"0xb\002019\000\00148", metadata !1, metadata !461} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!461 = metadata !{metadata !"0xb\002019\000\00147", metadata !1, metadata !462} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!462 = metadata !{metadata !"0xb\002018\000\00146", metadata !1, metadata !463} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!463 = metadata !{metadata !"0xb\002018\000\00145", metadata !1, metadata !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!464 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/PowerPC/pr18663-2.ll b/test/CodeGen/PowerPC/pr18663-2.ll
new file mode 100644
index 0000000..6b54440
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr18663-2.ll
@@ -0,0 +1,153 @@
+; RUN: llc < %s -march=ppc64 -mtriple=powerpc64-unknown-linux-gnu
+; RUN: llc < %s -march=ppc64le -mtriple=powerpc64le-unknown-linux-gnu
+
+%"class.std::__1::locale::id.1580.4307.4610.8491" = type { %"struct.std::__1::once_flag.1579.4306.4609.8490", i32 }
+%"struct.std::__1::once_flag.1579.4306.4609.8490" = type { i64 }
+%"class.Foam::IOerror.1581.4308.4611.8505" = type { %"class.Foam::error.1535.4262.4565.8504", %"class.Foam::string.1530.4257.4560.8499", i32, i32 }
+%"class.Foam::error.1535.4262.4565.8504" = type { %"class.std::exception.1523.4250.4553.8492", [36 x i8], %"class.Foam::string.1530.4257.4560.8499", %"class.Foam::string.1530.4257.4560.8499", i32, i8, i8, %"class.Foam::OStringStream.1534.4261.4564.8503"* }
+%"class.std::exception.1523.4250.4553.8492" = type { i32 (...)** }
+%"class.Foam::OStringStream.1534.4261.4564.8503" = type { %"class.Foam::OSstream.1533.4260.4563.8502" }
+%"class.Foam::OSstream.1533.4260.4563.8502" = type { [50 x i8], %"class.Foam::fileName.1531.4258.4561.8500", %"class.std::__1::basic_ostream.1532.4259.4562.8501"* }
+%"class.Foam::fileName.1531.4258.4561.8500" = type { %"class.Foam::string.1530.4257.4560.8499" }
+%"class.std::__1::basic_ostream.1532.4259.4562.8501" = type { i32 (...)**, [148 x i8] }
+%"class.Foam::string.1530.4257.4560.8499" = type { %"class.std::__1::basic_string.1529.4256.4559.8498" }
+%"class.std::__1::basic_string.1529.4256.4559.8498" = type { %"class.std::__1::__compressed_pair.1528.4255.4558.8497" }
+%"class.std::__1::__compressed_pair.1528.4255.4558.8497" = type { %"class.std::__1::__libcpp_compressed_pair_imp.1527.4254.4557.8496" }
+%"class.std::__1::__libcpp_compressed_pair_imp.1527.4254.4557.8496" = type { %"struct.std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__rep.1526.4253.4556.8495" }
+%"struct.std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__rep.1526.4253.4556.8495" = type { %union.anon.1525.4252.4555.8494 }
+%union.anon.1525.4252.4555.8494 = type { %"struct.std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__long.1524.4251.4554.8493" }
+%"struct.std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__long.1524.4251.4554.8493" = type { i64, i64, i8* }
+
+@.str3 = external unnamed_addr constant [16 x i8], align 1
+@_ZNSt3__15ctypeIcE2idE = external global %"class.std::__1::locale::id.1580.4307.4610.8491"
+@_ZN4Foam12FatalIOErrorE = external global %"class.Foam::IOerror.1581.4308.4611.8505"
+@.str204 = external unnamed_addr constant [18 x i8], align 1
+@.str205 = external unnamed_addr constant [34 x i8], align 1
+
+declare void @_ZN4FoamlsERNS_7OstreamEPKc() #0
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @_ZNKSt3__18ios_base6getlocEv() #0
+
+declare void @_ZNKSt3__16locale9use_facetERNS0_2idE() #0
+
+; Function Attrs: noreturn
+declare void @_ZNKSt3__121__basic_string_commonILb1EE20__throw_length_errorEv() #1 align 2
+
+declare void @_ZN4Foam6string6expandEb() #0
+
+declare void @_ZN4Foam8IFstreamC1ERKNS_8fileNameENS_8IOstream12streamFormatENS4_13versionNumberE() #0
+
+declare void @_ZN4Foam7IOerrorclEPKcS2_iRKNS_8IOstreamE() #0
+
+declare void @_ZN4Foam7IOerror4exitEi() #0
+
+; Function Attrs: inlinehint
+declare void @_ZN4Foam8fileName12stripInvalidEv() #2 align 2
+
+define void @_ZN4Foam3CSVINS_6VectorIdEEE4readEv() #0 align 2 {
+entry:
+  invoke void @_ZN4Foam6string6expandEb()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  br i1 undef, label %if.then.i.i.i.i176, label %_ZN4Foam6stringC2ERKS0_.exit.i
+
+if.then.i.i.i.i176:                               ; preds = %invoke.cont
+  invoke void @_ZNKSt3__121__basic_string_commonILb1EE20__throw_length_errorEv()
+          to label %.noexc unwind label %lpad
+
+.noexc:                                           ; preds = %if.then.i.i.i.i176
+  unreachable
+
+_ZN4Foam6stringC2ERKS0_.exit.i:                   ; preds = %invoke.cont
+  invoke void @_ZN4Foam8fileName12stripInvalidEv()
+          to label %invoke.cont2 unwind label %lpad.i
+
+lpad.i:                                           ; preds = %_ZN4Foam6stringC2ERKS0_.exit.i
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %ehcleanup142
+
+invoke.cont2:                                     ; preds = %_ZN4Foam6stringC2ERKS0_.exit.i
+  invoke void @_ZN4Foam8IFstreamC1ERKNS_8fileNameENS_8IOstream12streamFormatENS4_13versionNumberE()
+          to label %invoke.cont4 unwind label %lpad3
+
+invoke.cont4:                                     ; preds = %invoke.cont2
+  br i1 undef, label %for.body, label %if.then
+
+if.then:                                          ; preds = %invoke.cont4
+  invoke void @_ZN4Foam7IOerrorclEPKcS2_iRKNS_8IOstreamE()
+          to label %invoke.cont8 unwind label %lpad5
+
+invoke.cont8:                                     ; preds = %if.then
+  invoke void @_ZN4FoamlsERNS_7OstreamEPKc()
+          to label %memptr.end.i unwind label %lpad5
+
+memptr.end.i:                                     ; preds = %invoke.cont8
+  invoke void @_ZN4Foam7IOerror4exitEi()
+          to label %if.end unwind label %lpad5
+
+lpad:                                             ; preds = %if.then.i.i.i.i176, %entry
+  %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %ehcleanup142
+
+lpad3:                                            ; preds = %invoke.cont2
+  %2 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %ehcleanup142
+
+lpad5:                                            ; preds = %memptr.end.i, %invoke.cont8, %if.then
+  %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %ehcleanup142
+
+if.end:                                           ; preds = %memptr.end.i
+  br i1 undef, label %for.body, label %vector.body
+
+for.body:                                         ; preds = %if.end, %invoke.cont4
+  invoke void @_ZNKSt3__18ios_base6getlocEv()
+          to label %.noexc205 unwind label %lpad19
+
+.noexc205:                                        ; preds = %for.body
+  invoke void @_ZNKSt3__16locale9use_facetERNS0_2idE()
+          to label %invoke.cont.i.i.i unwind label %lpad.i.i.i
+
+invoke.cont.i.i.i:                                ; preds = %.noexc205
+  unreachable
+
+lpad.i.i.i:                                       ; preds = %.noexc205
+  %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %ehcleanup142
+
+lpad19:                                           ; preds = %for.body
+  %5 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %ehcleanup142
+
+vector.body:                                      ; preds = %vector.body, %if.end
+  %vec.phi = phi <8 x i32> [ %10, %vector.body ], [ undef, %if.end ]
+  %vec.phi1302 = phi <8 x i32> [ %11, %vector.body ], [ undef, %if.end ]
+  %vec.phi1303 = phi <8 x i32> [ %12, %vector.body ], [ undef, %if.end ]
+  %vec.phi1304 = phi <8 x i32> [ %13, %vector.body ], [ undef, %if.end ]
+  %6 = icmp sgt <8 x i32> undef, %vec.phi
+  %7 = icmp sgt <8 x i32> undef, %vec.phi1302
+  %8 = icmp sgt <8 x i32> undef, %vec.phi1303
+  %9 = icmp sgt <8 x i32> undef, %vec.phi1304
+  %10 = select <8 x i1> %6, <8 x i32> undef, <8 x i32> %vec.phi
+  %11 = select <8 x i1> %7, <8 x i32> undef, <8 x i32> %vec.phi1302
+  %12 = select <8 x i1> %8, <8 x i32> undef, <8 x i32> %vec.phi1303
+  %13 = select <8 x i1> %9, <8 x i32> undef, <8 x i32> %vec.phi1304
+  br label %vector.body
+
+ehcleanup142:                                     ; preds = %lpad19, %lpad.i.i.i, %lpad5, %lpad3, %lpad, %lpad.i
+  resume { i8*, i32 } undef
+}
+
+attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { noreturn "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { inlinehint "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
diff --git a/test/CodeGen/PowerPC/pr18663.ll b/test/CodeGen/PowerPC/pr18663.ll
new file mode 100644
index 0000000..1b85223
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr18663.ll
@@ -0,0 +1,298 @@
+; RUN: llc < %s -march=ppc64 -mtriple=powerpc64-unknown-linux-gnu
+; RUN: llc < %s -march=ppc64le -mtriple=powerpc64le-unknown-linux-gnu
+
+%class.Point.1 = type { %class.Tensor.0 }
+%class.Tensor.0 = type { [3 x double] }
+%class.TriaObjectAccessor.57 = type { %class.TriaAccessor.56 }
+%class.TriaAccessor.56 = type { i32, i32, %class.Triangulation.55* }
+%class.Triangulation.55 = type { %class.Subscriptor, %"class.std::vector.46", %"class.std::vector", %"class.std::vector.3.8", [255 x %class.Boundary.50*], i32, %struct.TriaNumberCache.54 }
+%class.Subscriptor = type { i32 (...)**, i32, %"class.std::type_info.2"* }
+%"class.std::type_info.2" = type { i32 (...)**, i8* }
+%"class.std::vector.46" = type { %"struct.std::_Vector_base.45" }
+%"struct.std::_Vector_base.45" = type { %"struct.std::_Vector_base<TriangulationLevel<3> *, std::allocator<TriangulationLevel<3> *> >::_Vector_impl.44" }
+%"struct.std::_Vector_base<TriangulationLevel<3> *, std::allocator<TriangulationLevel<3> *> >::_Vector_impl.44" = type { %class.TriangulationLevel.43**, %class.TriangulationLevel.43**, %class.TriangulationLevel.43** }
+%class.TriangulationLevel.43 = type { %class.TriangulationLevel.0.37, %"struct.TriangulationLevel<3>::HexesData.42" }
+%class.TriangulationLevel.0.37 = type { %class.TriangulationLevel.1.31, %"struct.TriangulationLevel<2>::QuadsData.36" }
+%class.TriangulationLevel.1.31 = type { %class.TriangulationLevel, %"struct.TriangulationLevel<1>::LinesData.30" }
+%class.TriangulationLevel = type { %"class.std::vector.3.8", %"class.std::vector.3.8", %"class.std::vector.7.12", %"class.std::vector.12.15" }
+%"class.std::vector.7.12" = type { %"struct.std::_Vector_base" }
+%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<std::pair<int, int>, std::allocator<std::pair<int, int> > >::_Vector_impl.10" }
+%"struct.std::_Vector_base<std::pair<int, int>, std::allocator<std::pair<int, int> > >::_Vector_impl.10" = type { %"struct.std::pair.9"*, %"struct.std::pair.9"*, %"struct.std::pair.9"* }
+%"struct.std::pair.9" = type opaque
+%"class.std::vector.12.15" = type { %"struct.std::_Vector_base.13.14" }
+%"struct.std::_Vector_base.13.14" = type { %"struct.std::_Vector_base<unsigned int, std::allocator<unsigned int> >::_Vector_impl.13" }
+%"struct.std::_Vector_base<unsigned int, std::allocator<unsigned int> >::_Vector_impl.13" = type { i32*, i32*, i32* }
+%"struct.TriangulationLevel<1>::LinesData.30" = type { %"class.std::vector.17.20", %"class.std::vector.22.23", %"class.std::vector.3.8", %"class.std::vector.3.8", %"class.std::vector.27.26", %"class.std::vector.32.29" }
+%"class.std::vector.17.20" = type { %"struct.std::_Vector_base.18.19" }
+%"struct.std::_Vector_base.18.19" = type { %"struct.std::_Vector_base<Line, std::allocator<Line> >::_Vector_impl.18" }
+%"struct.std::_Vector_base<Line, std::allocator<Line> >::_Vector_impl.18" = type { %class.Line.17*, %class.Line.17*, %class.Line.17* }
+%class.Line.17 = type { [2 x i32] }
+%"class.std::vector.22.23" = type { %"struct.std::_Vector_base.23.22" }
+%"struct.std::_Vector_base.23.22" = type { %"struct.std::_Vector_base<int, std::allocator<int> >::_Vector_impl.21" }
+%"struct.std::_Vector_base<int, std::allocator<int> >::_Vector_impl.21" = type { i32*, i32*, i32* }
+%"class.std::vector.27.26" = type { %"struct.std::_Vector_base.28.25" }
+%"struct.std::_Vector_base.28.25" = type { %"struct.std::_Vector_base<unsigned char, std::allocator<unsigned char> >::_Vector_impl.24" }
+%"struct.std::_Vector_base<unsigned char, std::allocator<unsigned char> >::_Vector_impl.24" = type { i8*, i8*, i8* }
+%"class.std::vector.32.29" = type { %"struct.std::_Vector_base.33.28" }
+%"struct.std::_Vector_base.33.28" = type { %"struct.std::_Vector_base<void *, std::allocator<void *> >::_Vector_impl.27" }
+%"struct.std::_Vector_base<void *, std::allocator<void *> >::_Vector_impl.27" = type { i8**, i8**, i8** }
+%"struct.TriangulationLevel<2>::QuadsData.36" = type { %"class.std::vector.37.35", %"class.std::vector.22.23", %"class.std::vector.3.8", %"class.std::vector.3.8", %"class.std::vector.27.26", %"class.std::vector.32.29" }
+%"class.std::vector.37.35" = type { %"struct.std::_Vector_base.38.34" }
+%"struct.std::_Vector_base.38.34" = type { %"struct.std::_Vector_base<Quad, std::allocator<Quad> >::_Vector_impl.33" }
+%"struct.std::_Vector_base<Quad, std::allocator<Quad> >::_Vector_impl.33" = type { %class.Quad.32*, %class.Quad.32*, %class.Quad.32* }
+%class.Quad.32 = type { [4 x i32] }
+%"struct.TriangulationLevel<3>::HexesData.42" = type { %"class.std::vector.42.41", %"class.std::vector.22.23", %"class.std::vector.3.8", %"class.std::vector.3.8", %"class.std::vector.27.26", %"class.std::vector.32.29", %"class.std::vector.3.8" }
+%"class.std::vector.42.41" = type { %"struct.std::_Vector_base.43.40" }
+%"struct.std::_Vector_base.43.40" = type { %"struct.std::_Vector_base<Hexahedron, std::allocator<Hexahedron> >::_Vector_impl.39" }
+%"struct.std::_Vector_base<Hexahedron, std::allocator<Hexahedron> >::_Vector_impl.39" = type { %class.Hexahedron.38*, %class.Hexahedron.38*, %class.Hexahedron.38* }
+%class.Hexahedron.38= type { [6 x i32] }
+%"class.std::vector" = type { %"struct.std::_Vector_base.48.48" }
+%"struct.std::_Vector_base.48.48" = type { %"struct.std::_Vector_base<Point<3>, std::allocator<Point<3> > >::_Vector_impl.47" }
+%"struct.std::_Vector_base<Point<3>, std::allocator<Point<3> > >::_Vector_impl.47" = type { %class.Point.1*, %class.Point.1*, %class.Point.1* }
+%"class.std::vector.3.8" = type { %"struct.std::_Bvector_base.7" }
+%"struct.std::_Bvector_base.7" = type { %"struct.std::_Bvector_base<std::allocator<bool> >::_Bvector_impl.6" }
+%"struct.std::_Bvector_base<std::allocator<bool> >::_Bvector_impl.6" = type { %"struct.std::_Bit_iterator.5", %"struct.std::_Bit_iterator.5", i64* }
+%"struct.std::_Bit_iterator.5" = type { %"struct.std::_Bit_iterator_base.base.4", [4 x i8] }
+%"struct.std::_Bit_iterator_base.base.4" = type <{ i64*, i32 }>
+%class.Boundary.50 = type opaque
+%struct.TriaNumberCache.54 = type { %struct.TriaNumberCache.52.52, i32, %"class.std::vector.12.15", i32, %"class.std::vector.12.15" }
+%struct.TriaNumberCache.52.52 = type { %struct.TriaNumberCache.53.51, i32, %"class.std::vector.12.15", i32, %"class.std::vector.12.15" }
+%struct.TriaNumberCache.53.51 = type { i32, %"class.std::vector.12.15", i32, %"class.std::vector.12.15" }
+
+define void @_ZNK18TriaObjectAccessorILi3ELi3EE10barycenterEv(%class.Point.1* noalias nocapture sret %agg.result, %class.TriaObjectAccessor.57* %this) #0 align 2 {
+entry:
+  %0 = load double* null, align 8
+  %1 = load double* undef, align 8
+  %call18 = tail call dereferenceable(24) %class.Point.1* @_ZNK18TriaObjectAccessorILi3ELi3EE6vertexEj(%class.TriaObjectAccessor.57* %this, i32 zeroext 6)
+  %2 = load double* undef, align 8
+  %call21 = tail call dereferenceable(24) %class.Point.1* @_ZNK18TriaObjectAccessorILi3ELi3EE6vertexEj(%class.TriaObjectAccessor.57* %this, i32 zeroext 7)
+  %3 = load double* undef, align 8
+  %call33 = tail call dereferenceable(24) %class.Point.1* @_ZNK18TriaObjectAccessorILi3ELi3EE6vertexEj(%class.TriaObjectAccessor.57* %this, i32 zeroext 3)
+  %4 = load double* null, align 8
+  %5 = load double* undef, align 8
+  %call45 = tail call dereferenceable(24) %class.Point.1* @_ZNK18TriaObjectAccessorILi3ELi3EE6vertexEj(%class.TriaObjectAccessor.57* %this, i32 zeroext 7)
+  %6 = load double* undef, align 8
+  %call48 = tail call dereferenceable(24) %class.Point.1* @_ZNK18TriaObjectAccessorILi3ELi3EE6vertexEj(%class.TriaObjectAccessor.57* %this, i32 zeroext 0)
+  %7 = load double* undef, align 8
+  %call66 = tail call dereferenceable(24) %class.Point.1* @_ZNK18TriaObjectAccessorILi3ELi3EE6vertexEj(%class.TriaObjectAccessor.57* %this, i32 zeroext 6)
+  %8 = load double* undef, align 8
+  %mul334 = fmul double undef, 2.000000e+00
+  %mul579 = fmul double %2, %5
+  %mul597 = fmul double undef, %mul579
+  %mul679 = fmul double %2, %8
+  %mul1307 = fmul double undef, %1
+  %mul2092 = fmul double undef, %4
+  %mul2679 = fmul double undef, undef
+  %mul2931 = fmul double undef, %3
+  %mul3094 = fmul double undef, %3
+  %mul3096 = fmul double %mul3094, %8
+  %sub3097 = fsub double 0.000000e+00, %mul3096
+  %add3105 = fadd double undef, %sub3097
+  %add3113 = fadd double 0.000000e+00, %add3105
+  %sub3121 = fsub double %add3113, undef
+  %sub3129 = fsub double %sub3121, undef
+  %add3137 = fadd double undef, %sub3129
+  %add3145 = fadd double undef, %add3137
+  %sub3153 = fsub double %add3145, undef
+  %sub3162 = fsub double %sub3153, 0.000000e+00
+  %add3171 = fadd double undef, %sub3162
+  %add3180 = fadd double undef, %add3171
+  %add3189 = fadd double 0.000000e+00, %add3180
+  %mul3197 = fmul double %4, %mul2679
+  %sub3198 = fsub double %add3189, %mul3197
+  %sub3207 = fsub double %sub3198, 0.000000e+00
+  %mul3212 = fmul double %2, undef
+  %mul3214 = fmul double %mul3212, undef
+  %sub3215 = fsub double %sub3207, %mul3214
+  %mul3222 = fmul double %5, 0.000000e+00
+  %sub3223 = fsub double %sub3215, %mul3222
+  %mul3228 = fmul double %2, undef
+  %mul3230 = fmul double %3, %mul3228
+  %add3231 = fadd double %mul3230, %sub3223
+  %mul3236 = fmul double undef, undef
+  %mul3238 = fmul double %mul3236, %8
+  %add3239 = fadd double %mul3238, %add3231
+  %mul3244 = fmul double %mul1307, %3
+  %mul3246 = fmul double %mul3244, %7
+  %sub3247 = fsub double %add3239, %mul3246
+  %mul3252 = fmul double undef, undef
+  %mul3254 = fmul double %mul3252, %7
+  %add3255 = fadd double %mul3254, %sub3247
+  %sub3263 = fsub double %add3255, undef
+  %add3271 = fadd double 0.000000e+00, %sub3263
+  %sub3279 = fsub double %add3271, undef
+  %sub3287 = fsub double %sub3279, undef
+  %mul3292 = fmul double %mul1307, %5
+  %mul3294 = fmul double %mul3292, undef
+  %add3295 = fadd double %mul3294, %sub3287
+  %add3303 = fadd double undef, %add3295
+  %add3311 = fadd double 0.000000e+00, %add3303
+  %mul3318 = fmul double undef, %7
+  %sub3319 = fsub double %add3311, %mul3318
+  %mul3326 = fmul double %4, %mul3228
+  %sub3327 = fsub double %sub3319, %mul3326
+  %mul3334 = fmul double undef, %8
+  %sub3335 = fsub double %sub3327, %mul3334
+  %add3343 = fadd double undef, %sub3335
+  %mul3350 = fmul double %mul3212, %7
+  %add3351 = fadd double %mul3350, %add3343
+  %mul3358 = fmul double %mul2092, undef
+  %sub3359 = fsub double %add3351, %mul3358
+  %mul3362 = fmul double undef, %1
+  %mul3366 = fmul double 0.000000e+00, %8
+  %add3367 = fadd double %mul3366, %sub3359
+  %mul3372 = fmul double %mul3362, %5
+  %sub3375 = fsub double %add3367, undef
+  %add3383 = fadd double undef, %sub3375
+  %mul3389 = fmul double %2, 0.000000e+00
+  %mul3391 = fmul double %4, %mul3389
+  %sub3392 = fsub double %add3383, %mul3391
+  %mul3396 = fmul double undef, 0.000000e+00
+  %mul3400 = fmul double undef, %7
+  %sub3401 = fsub double %sub3392, %mul3400
+  %mul3407 = fmul double %mul3396, %4
+  %mul3409 = fmul double %mul3407, %8
+  %add3410 = fadd double %mul3409, %sub3401
+  %add3419 = fadd double undef, %add3410
+  %mul3423 = fmul double undef, %mul334
+  %add3428 = fadd double undef, %add3419
+  %add3437 = fadd double undef, %add3428
+  %mul3443 = fmul double %mul3423, %3
+  %mul3445 = fmul double %mul3443, %8
+  %sub3446 = fsub double %add3437, %mul3445
+  %mul3453 = fmul double %mul3372, undef
+  %add3454 = fadd double %mul3453, %sub3446
+  %add3462 = fadd double 0.000000e+00, %add3454
+  %mul3467 = fmul double %mul3362, %3
+  %mul3469 = fmul double %mul3467, %8
+  %sub3470 = fsub double %add3462, %mul3469
+  %add3478 = fadd double 0.000000e+00, %sub3470
+  %sub3486 = fsub double %add3478, undef
+  %mul3490 = fmul double %mul334, 0.000000e+00
+  %mul3492 = fmul double %2, %mul3490
+  %mul3494 = fmul double %mul3492, undef
+  %sub3495 = fsub double %sub3486, %mul3494
+  %sub3503 = fsub double %sub3495, undef
+  %sub3512 = fsub double %sub3503, undef
+  %add3520 = fadd double undef, %sub3512
+  %sub3528 = fsub double %add3520, undef
+  %add3537 = fadd double undef, %sub3528
+  %add3545 = fadd double 0.000000e+00, %add3537
+  %sub3553 = fsub double %add3545, undef
+  %add3561 = fadd double undef, %sub3553
+  %sub3569 = fsub double %add3561, undef
+  %mul3574 = fmul double undef, undef
+  %mul3576 = fmul double %mul3574, %7
+  %add3577 = fadd double %mul3576, %sub3569
+  %sub3585 = fsub double %add3577, undef
+  %mul3592 = fmul double %4, undef
+  %sub3593 = fsub double %sub3585, %mul3592
+  %mul3598 = fmul double %2, undef
+  %mul3600 = fmul double %mul3598, %7
+  %add3601 = fadd double %mul3600, %sub3593
+  %mul3608 = fmul double %mul3598, undef
+  %sub3609 = fsub double %add3601, %mul3608
+  %sub3618 = fsub double %sub3609, undef
+  %add3627 = fadd double undef, %sub3618
+  %add3635 = fadd double undef, %add3627
+  %mul3638 = fmul double undef, %2
+  %mul3640 = fmul double %mul3638, %5
+  %mul3642 = fmul double %mul3640, %7
+  %sub3643 = fsub double %add3635, %mul3642
+  %mul3648 = fmul double %1, undef
+  %mul3650 = fmul double %mul3648, %8
+  %sub3651 = fsub double %sub3643, %mul3650
+  %mul3656 = fmul double %mul3638, %4
+  %mul3658 = fmul double %mul3656, %8
+  %add3659 = fadd double %mul3658, %sub3651
+  %mul3666 = fmul double %5, 0.000000e+00
+  %add3667 = fadd double %mul3666, %add3659
+  %sub3675 = fsub double %add3667, undef
+  %mul3680 = fmul double %mul3638, %3
+  %mul3682 = fmul double %mul3680, %8
+  %sub3683 = fsub double %sub3675, %mul3682
+  %add3692 = fadd double 0.000000e+00, %sub3683
+  %mul3696 = fmul double undef, undef
+  %mul3698 = fmul double %mul3696, %4
+  %mul3700 = fmul double %mul3698, %8
+  %add3701 = fadd double %mul3700, %add3692
+  %sub3710 = fsub double %add3701, undef
+  %mul3716 = fmul double undef, %3
+  %mul3718 = fmul double %mul3716, %8
+  %sub3719 = fsub double %sub3710, %mul3718
+  %add3727 = fadd double undef, %sub3719
+  %mul3734 = fmul double %mul3574, %8
+  %add3735 = fadd double %mul3734, %add3727
+  %sub3743 = fsub double %add3735, 0.000000e+00
+  %add3751 = fadd double 0.000000e+00, %sub3743
+  %mul3758 = fmul double %6, 0.000000e+00
+  %sub3759 = fsub double %add3751, %mul3758
+  %mul3764 = fmul double undef, %mul2931
+  %mul3766 = fmul double %mul3764, undef
+  %sub3767 = fsub double %sub3759, %mul3766
+  %add3775 = fadd double 0.000000e+00, %sub3767
+  %add3783 = fadd double undef, %add3775
+  %sub3791 = fsub double %add3783, 0.000000e+00
+  %add3799 = fadd double undef, %sub3791
+  %sub3807 = fsub double %add3799, undef
+  %mul3814 = fmul double 0.000000e+00, undef
+  %add3815 = fadd double %mul3814, %sub3807
+  %mul3822 = fmul double %mul597, undef
+  %sub3823 = fsub double %add3815, %mul3822
+  %add3831 = fadd double undef, %sub3823
+  %mul3836 = fmul double undef, %mul679
+  %mul3838 = fmul double %6, %mul3836
+  %sub3839 = fsub double %add3831, %mul3838
+  %add3847 = fadd double undef, %sub3839
+  %add3855 = fadd double undef, %add3847
+  %mul3858 = fmul double undef, %8
+  %mul3860 = fmul double undef, %mul3858
+  %mul3862 = fmul double %6, %mul3860
+  %sub3863 = fsub double %add3855, %mul3862
+  %add3872 = fadd double undef, %sub3863
+  %sub3880 = fsub double %add3872, undef
+  %sub3889 = fsub double %sub3880, undef
+  %sub3898 = fsub double %sub3889, undef
+  %add3907 = fadd double undef, %sub3898
+  %sub3915 = fsub double %add3907, 0.000000e+00
+  %add3923 = fadd double undef, %sub3915
+  %mul3930 = fmul double %3, undef
+  %add3931 = fadd double %mul3930, %add3923
+  %add3940 = fadd double undef, %add3931
+  %sub3949 = fsub double %add3940, undef
+  %mul3952 = fmul double %2, %3
+  %sub3957 = fsub double %sub3949, undef
+  %sub3966 = fsub double %sub3957, undef
+  %add3975 = fadd double undef, %sub3966
+  %add3983 = fadd double undef, %add3975
+  %sub3992 = fsub double %add3983, undef
+  %mul3997 = fmul double undef, %mul3952
+  %mul3999 = fmul double %mul3997, %8
+  %add4000 = fadd double %mul3999, %sub3992
+  %sub4008 = fsub double %add4000, undef
+  %add4017 = fadd double undef, %sub4008
+  %add4026 = fadd double 0.000000e+00, %add4017
+  %mul4034 = fmul double %6, undef
+  %sub4035 = fsub double %add4026, %mul4034
+  %add4043 = fadd double undef, %sub4035
+  %sub4051 = fsub double %add4043, 0.000000e+00
+  %mul4916 = fmul double 0.000000e+00, %sub4051
+  %mul4917 = fmul double %mul4916, 0x3FC5555555555555
+  %mul7317 = fmul double 0.000000e+00, %3
+  %mul7670 = fmul double %0, %mul7317
+  %mul8882 = fmul double %0, 0.000000e+00
+  %mul8884 = fmul double undef, %mul8882
+  %sub8885 = fsub double 0.000000e+00, %mul8884
+  %mul8892 = fmul double %mul7670, undef
+  %add8893 = fadd double %mul8892, %sub8885
+  %mul8900 = fmul double undef, undef
+  %add8901 = fadd double %mul8900, %add8893
+  %mul9767 = fmul double 0.000000e+00, %add8901
+  %mul9768 = fmul double %mul9767, 0x3FC5555555555555
+  store double %mul4917, double* undef, align 8
+  store double %mul9768, double* undef, align 8
+  ret void
+}
+
+declare dereferenceable(24) %class.Point.1* @_ZNK18TriaObjectAccessorILi3ELi3EE6vertexEj(%class.TriaObjectAccessor.57*, i32 zeroext) #0
+
diff --git a/test/CodeGen/PowerPC/pr20442.ll b/test/CodeGen/PowerPC/pr20442.ll
new file mode 100644
index 0000000..ad43a04
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr20442.ll
@@ -0,0 +1,79 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc-unknown-linux-gnu"
+
+; This code would cause code generation like this after PPCCTRLoops ran:
+;  %indvar = phi i32 [ 0, %for.body ], [ %indvar.next, %if.then6 ]
+;  %j.1.ph13 = phi i32 [ %j.110, %if.then6 ], [ 0, %for.body ], [ 0, %for.body ]
+;  %c.0.ph12 = phi i32 [ %dec, %if.then6 ], [ %2, %for.body ], [ %2, %for.body ]
+; which would fail verification because the created induction variable does not
+; have as many predecessor entries as the other PHIs.
+; CHECK-LABEL: @fn1
+; CHECK: mtctr
+
+%struct.anon = type { i32 }
+%struct.anon.0 = type { i32 }
+
+@b = common global %struct.anon* null, align 4
+@a = common global %struct.anon.0* null, align 4
+
+; Function Attrs: nounwind readonly uwtable
+define i32 @fn1() #0 {
+entry:
+  %0 = load %struct.anon** @b, align 4
+  %1 = ptrtoint %struct.anon* %0 to i32
+  %cmp = icmp sgt %struct.anon* %0, null
+  %2 = load %struct.anon.0** @a, align 4
+  br i1 %cmp, label %for.bodythread-pre-split, label %if.end8
+
+for.bodythread-pre-split:                         ; preds = %entry
+  %aclass = getelementptr inbounds %struct.anon.0* %2, i32 0, i32 0
+  %.pr = load i32* %aclass, align 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.bodythread-pre-split, %for.body
+  switch i32 %.pr, label %for.body [
+    i32 0, label %while.body.lr.ph.preheader
+    i32 2, label %while.body.lr.ph.preheader
+  ]
+
+while.body.lr.ph.preheader:                       ; preds = %for.body, %for.body
+  br label %while.body.lr.ph
+
+while.body.lr.ph:                                 ; preds = %while.body.lr.ph.preheader, %if.then6
+  %j.1.ph13 = phi i32 [ %j.110.lcssa, %if.then6 ], [ 0, %while.body.lr.ph.preheader ]
+  %c.0.ph12 = phi i32 [ %dec, %if.then6 ], [ %1, %while.body.lr.ph.preheader ]
+  br label %while.body
+
+while.cond:                                       ; preds = %while.body
+  %cmp2 = icmp slt i32 %inc7, %c.0.ph12
+  br i1 %cmp2, label %while.body, label %if.end8.loopexit
+
+while.body:                                       ; preds = %while.body.lr.ph, %while.cond
+  %j.110 = phi i32 [ %j.1.ph13, %while.body.lr.ph ], [ %inc7, %while.cond ]
+  %aclass_index = getelementptr inbounds %struct.anon* %0, i32 %j.110, i32 0
+  %3 = load i32* %aclass_index, align 4
+  %aclass5 = getelementptr inbounds %struct.anon.0* %2, i32 %3, i32 0
+  %4 = load i32* %aclass5, align 4
+  %tobool = icmp eq i32 %4, 0
+  %inc7 = add nsw i32 %j.110, 1
+  br i1 %tobool, label %while.cond, label %if.then6
+
+if.then6:                                         ; preds = %while.body
+  %j.110.lcssa = phi i32 [ %j.110, %while.body ]
+  %dec = add nsw i32 %c.0.ph12, -1
+  %cmp29 = icmp slt i32 %j.110.lcssa, %dec
+  br i1 %cmp29, label %while.body.lr.ph, label %if.end8.loopexit17
+
+if.end8.loopexit:                                 ; preds = %while.cond
+  br label %if.end8
+
+if.end8.loopexit17:                               ; preds = %if.then6
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.end8.loopexit17, %if.end8.loopexit, %entry
+  ret i32 undef
+}
+
+attributes #0 = { nounwind readonly uwtable }
+
diff --git a/test/CodeGen/PowerPC/recipest.ll b/test/CodeGen/PowerPC/recipest.ll
index 891e801..cd77548 100644
--- a/test/CodeGen/PowerPC/recipest.ll
+++ b/test/CodeGen/PowerPC/recipest.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck -check-prefix=CHECK-SAFE %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math -mattr=-vsx | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck -check-prefix=CHECK-SAFE %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -8,7 +8,6 @@ declare float @llvm.sqrt.f32(float)
 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
 
 define double @foo(double %a, double %b) nounwind {
-entry:
   %x = call double @llvm.sqrt.f64(double %b)
   %r = fdiv double %a, %x
   ret double %r
@@ -17,12 +16,12 @@ entry:
 ; CHECK-DAG: frsqrte
 ; CHECK-DAG: fnmsub
 ; CHECK: fmul
-; CHECK: fmadd
-; CHECK: fmul
-; CHECK: fmul
-; CHECK: fmadd
-; CHECK: fmul
-; CHECK: fmul
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: fmul
+; CHECK-NEXT: fmul
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: fmul
+; CHECK-NEXT: fmul
 ; CHECK: blr
 
 ; CHECK-SAFE: @foo
@@ -32,7 +31,6 @@ entry:
 }
 
 define double @foof(double %a, float %b) nounwind {
-entry:
   %x = call float @llvm.sqrt.f32(float %b)
   %y = fpext float %x to double
   %r = fdiv double %a, %y
@@ -42,10 +40,10 @@ entry:
 ; CHECK-DAG: frsqrtes
 ; CHECK-DAG: fnmsubs
 ; CHECK: fmuls
-; CHECK: fmadds
-; CHECK: fmuls
-; CHECK: fmul
-; CHECK: blr
+; CHECK-NEXT: fmadds
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: fmul
+; CHECK-NEXT: blr
 
 ; CHECK-SAFE: @foof
 ; CHECK-SAFE: fsqrts
@@ -54,7 +52,6 @@ entry:
 }
 
 define float @food(float %a, double %b) nounwind {
-entry:
   %x = call double @llvm.sqrt.f64(double %b)
   %y = fptrunc double %x to float
   %r = fdiv float %a, %y
@@ -64,14 +61,14 @@ entry:
 ; CHECK-DAG: frsqrte
 ; CHECK-DAG: fnmsub
 ; CHECK: fmul
-; CHECK: fmadd
-; CHECK: fmul
-; CHECK: fmul
-; CHECK: fmadd
-; CHECK: fmul
-; CHECK: frsp
-; CHECK: fmuls
-; CHECK: blr
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: fmul
+; CHECK-NEXT: fmul
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: fmul
+; CHECK-NEXT: frsp
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: blr
 
 ; CHECK-SAFE: @foo
 ; CHECK-SAFE: fsqrt
@@ -80,7 +77,6 @@ entry:
 }
 
 define float @goo(float %a, float %b) nounwind {
-entry:
   %x = call float @llvm.sqrt.f32(float %b)
   %r = fdiv float %a, %x
   ret float %r
@@ -89,10 +85,10 @@ entry:
 ; CHECK-DAG: frsqrtes
 ; CHECK-DAG: fnmsubs
 ; CHECK: fmuls
-; CHECK: fmadds
-; CHECK: fmuls
-; CHECK: fmuls
-; CHECK: blr
+; CHECK-NEXT: fmadds
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: blr
 
 ; CHECK-SAFE: @goo
 ; CHECK-SAFE: fsqrts
@@ -100,8 +96,35 @@ entry:
 ; CHECK-SAFE: blr
 }
 
+; Recognize that this is rsqrt(a) * rcp(b) * c, 
+; not 1 / ( 1 / sqrt(a)) * rcp(b) * c.
+define float @rsqrt_fmul(float %a, float %b, float %c) {
+  %x = call float @llvm.sqrt.f32(float %a)
+  %y = fmul float %x, %b 
+  %z = fdiv float %c, %y
+  ret float %z
+
+; CHECK: @rsqrt_fmul
+; CHECK-DAG: frsqrtes
+; CHECK-DAG: fres
+; CHECK-DAG: fnmsubs
+; CHECK-DAG: fmuls
+; CHECK-DAG: fnmsubs
+; CHECK-DAG: fmadds
+; CHECK-DAG: fmadds
+; CHECK: fmuls
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: blr
+
+; CHECK-SAFE: @rsqrt_fmul
+; CHECK-SAFE: fsqrts
+; CHECK-SAFE: fmuls
+; CHECK-SAFE: fdivs
+; CHECK-SAFE: blr
+}
+
 define <4 x float> @hoo(<4 x float> %a, <4 x float> %b) nounwind {
-entry:
   %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b)
   %r = fdiv <4 x float> %a, %x
   ret <4 x float> %r
@@ -115,7 +138,6 @@ entry:
 }
 
 define double @foo2(double %a, double %b) nounwind {
-entry:
   %r = fdiv double %a, %b
   ret double %r
 
@@ -123,10 +145,10 @@ entry:
 ; CHECK-DAG: fre
 ; CHECK-DAG: fnmsub
 ; CHECK: fmadd
-; CHECK: fnmsub
-; CHECK: fmadd
-; CHECK: fmul
-; CHECK: blr
+; CHECK-NEXT: fnmsub
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: fmul
+; CHECK-NEXT: blr
 
 ; CHECK-SAFE: @foo2
 ; CHECK-SAFE: fdiv
@@ -134,7 +156,6 @@ entry:
 }
 
 define float @goo2(float %a, float %b) nounwind {
-entry:
   %r = fdiv float %a, %b
   ret float %r
 
@@ -142,8 +163,8 @@ entry:
 ; CHECK-DAG: fres
 ; CHECK-DAG: fnmsubs
 ; CHECK: fmadds
-; CHECK: fmuls
-; CHECK: blr
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: blr
 
 ; CHECK-SAFE: @goo2
 ; CHECK-SAFE: fdivs
@@ -151,7 +172,6 @@ entry:
 }
 
 define <4 x float> @hoo2(<4 x float> %a, <4 x float> %b) nounwind {
-entry:
   %r = fdiv <4 x float> %a, %b
   ret <4 x float> %r
 
@@ -164,7 +184,6 @@ entry:
 }
 
 define double @foo3(double %a) nounwind {
-entry:
   %r = call double @llvm.sqrt.f64(double %a)
   ret double %r
 
@@ -173,16 +192,12 @@ entry:
 ; CHECK-DAG: frsqrte
 ; CHECK-DAG: fnmsub
 ; CHECK: fmul
-; CHECK: fmadd
-; CHECK: fmul
-; CHECK: fmul
-; CHECK: fmadd
-; CHECK: fmul
-; CHECK: fre
-; CHECK: fnmsub
-; CHECK: fmadd
-; CHECK: fnmsub
-; CHECK: fmadd
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: fmul
+; CHECK-NEXT: fmul
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: fmul
+; CHECK-NEXT: fmul
 ; CHECK: blr
 
 ; CHECK-SAFE: @foo3
@@ -191,7 +206,6 @@ entry:
 }
 
 define float @goo3(float %a) nounwind {
-entry:
   %r = call float @llvm.sqrt.f32(float %a)
   ret float %r
 
@@ -200,11 +214,9 @@ entry:
 ; CHECK-DAG: frsqrtes
 ; CHECK-DAG: fnmsubs
 ; CHECK: fmuls
-; CHECK: fmadds
-; CHECK: fmuls
-; CHECK: fres
-; CHECK: fnmsubs
-; CHECK: fmadds
+; CHECK-NEXT: fmadds
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: fmuls
 ; CHECK: blr
 
 ; CHECK-SAFE: @goo3
@@ -213,13 +225,11 @@ entry:
 }
 
 define <4 x float> @hoo3(<4 x float> %a) nounwind {
-entry:
   %r = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
   ret <4 x float> %r
 
 ; CHECK: @hoo3
 ; CHECK: vrsqrtefp
-; CHECK-DAG: vrefp
 ; CHECK-DAG: vcmpeqfp
 
 ; CHECK-SAFE: @hoo3
diff --git a/test/CodeGen/PowerPC/resolvefi-disp.ll b/test/CodeGen/PowerPC/resolvefi-disp.ll
new file mode 100644
index 0000000..ca42bcd
--- /dev/null
+++ b/test/CodeGen/PowerPC/resolvefi-disp.ll
@@ -0,0 +1,71 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -print-after=localstackalloc <%s >%t 2>&1 && FileCheck <%t %s
+
+; Due to a bug in isFrameOffsetLegal we ended up with resolveFrameIndex creating
+; addresses with out-of-range displacements.  Verify that this no longer happens.
+; CHECK-NOT: LD {{3276[8-9]}}
+; CHECK-NOT: LD {{327[7-9][0-9]}}
+; CHECK-NOT: LD {{32[8-9][0-9][0-9]}}
+; CHECK-NOT: LD {{3[3-9][0-9][0-9][0-9]}}
+; CHECK-NOT: LD {{[4-9][0-9][0-9][0-9][0-9]}}
+; CHECK-NOT: LD {{[1-9][0-9][0-9][0-9][0-9][0-9]+}}
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+%struct.S2760 = type { <2 x float>, %struct.anon, i32, [28 x i8] }
+%struct.anon = type { [11 x %struct.anon.0], i64, [6 x { i64, i64 }], [24 x i8] }
+%struct.anon.0 = type { [30 x %union.U4DI], i8, [0 x i16], [30 x i8] }
+%union.U4DI = type { <4 x i64> }
+
+@s2760 = external global %struct.S2760
+@fails = external global i32
+
+define void @check2760(%struct.S2760* noalias sret %agg.result, %struct.S2760* byval align 16, %struct.S2760* %arg1, %struct.S2760* byval align 16) {
+entry:
+  %arg0 = alloca %struct.S2760, align 32
+  %arg2 = alloca %struct.S2760, align 32
+  %arg1.addr = alloca %struct.S2760*, align 8
+  %ret = alloca %struct.S2760, align 32
+  %b1 = alloca %struct.S2760, align 32
+  %b2 = alloca %struct.S2760, align 32
+  %2 = bitcast %struct.S2760* %arg0 to i8*
+  %3 = bitcast %struct.S2760* %0 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 11104, i32 16, i1 false)
+  %4 = bitcast %struct.S2760* %arg2 to i8*
+  %5 = bitcast %struct.S2760* %1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 11104, i32 16, i1 false)
+  store %struct.S2760* %arg1, %struct.S2760** %arg1.addr, align 8
+  %6 = bitcast %struct.S2760* %ret to i8*
+  call void @llvm.memset.p0i8.i64(i8* %6, i8 0, i64 11104, i32 32, i1 false)
+  %7 = bitcast %struct.S2760* %b1 to i8*
+  call void @llvm.memset.p0i8.i64(i8* %7, i8 0, i64 11104, i32 32, i1 false)
+  %8 = bitcast %struct.S2760* %b2 to i8*
+  call void @llvm.memset.p0i8.i64(i8* %8, i8 0, i64 11104, i32 32, i1 false)
+  %b = getelementptr inbounds %struct.S2760* %arg0, i32 0, i32 1
+  %g = getelementptr inbounds %struct.anon* %b, i32 0, i32 1
+  %9 = load i64* %g, align 8
+  %10 = load i64* getelementptr inbounds (%struct.S2760* @s2760, i32 0, i32 1, i32 1), align 8
+  %cmp = icmp ne i64 %9, %10
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %11 = load i32* @fails, align 4
+  %inc = add nsw i32 %11, 1
+  store i32 %inc, i32* @fails, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %12 = load i64* getelementptr inbounds (%struct.S2760* @s2760, i32 0, i32 1, i32 1), align 8
+  %b3 = getelementptr inbounds %struct.S2760* %ret, i32 0, i32 1
+  %g4 = getelementptr inbounds %struct.anon* %b3, i32 0, i32 1
+  store i64 %12, i64* %g4, align 8
+  %13 = bitcast %struct.S2760* %agg.result to i8*
+  %14 = bitcast %struct.S2760* %ret to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %13, i8* %14, i64 11104, i32 32, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+
diff --git a/test/CodeGen/PowerPC/rounding-ops.ll b/test/CodeGen/PowerPC/rounding-ops.ll
index bf0a641..42f1236 100644
--- a/test/CodeGen/PowerPC/rounding-ops.ll
+++ b/test/CodeGen/PowerPC/rounding-ops.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx < %s | FileCheck -check-prefix=CHECK-VSX %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -8,6 +9,8 @@ define float @test1(float %x) nounwind  {
 
 ; CHECK-LABEL: test1:
 ; CHECK: frim 1, 1
+; CHECK-VSX-LABEL: test1:
+; CHECK-VSX: frim 1, 1
 }
 
 declare float @floorf(float) nounwind readnone
@@ -18,6 +21,8 @@ define double @test2(double %x) nounwind  {
 
 ; CHECK-LABEL: test2:
 ; CHECK: frim 1, 1
+; CHECK-VSX-LABEL: test2:
+; CHECK-VSX: xsrdpim 1, 1
 }
 
 declare double @floor(double) nounwind readnone
@@ -28,6 +33,8 @@ define float @test3(float %x) nounwind  {
 
 ; CHECK-LABEL: test3:
 ; CHECK: frin 1, 1
+; CHECK-VSX-LABEL: test3:
+; CHECK-VSX: frin 1, 1
 }
 
 declare float @roundf(float) nounwind readnone
@@ -38,6 +45,8 @@ define double @test4(double %x) nounwind  {
 
 ; CHECK-LABEL: test4:
 ; CHECK: frin 1, 1
+; CHECK-VSX-LABEL: test4:
+; CHECK-VSX: xsrdpi 1, 1
 }
 
 declare double @round(double) nounwind readnone
@@ -48,6 +57,8 @@ define float @test5(float %x) nounwind  {
 
 ; CHECK-LABEL: test5:
 ; CHECK: frip 1, 1
+; CHECK-VSX-LABEL: test5:
+; CHECK-VSX: frip 1, 1
 }
 
 declare float @ceilf(float) nounwind readnone
@@ -58,6 +69,8 @@ define double @test6(double %x) nounwind  {
 
 ; CHECK-LABEL: test6:
 ; CHECK: frip 1, 1
+; CHECK-VSX-LABEL: test6:
+; CHECK-VSX: xsrdpip 1, 1
 }
 
 declare double @ceil(double) nounwind readnone
@@ -68,6 +81,8 @@ define float @test9(float %x) nounwind  {
 
 ; CHECK-LABEL: test9:
 ; CHECK: friz 1, 1
+; CHECK-VSX-LABEL: test9:
+; CHECK-VSX: friz 1, 1
 }
 
 declare float @truncf(float) nounwind readnone
@@ -78,6 +93,8 @@ define double @test10(double %x) nounwind  {
 
 ; CHECK-LABEL: test10:
 ; CHECK: friz 1, 1
+; CHECK-VSX-LABEL: test10:
+; CHECK-VSX: xsrdpiz 1, 1
 }
 
 declare double @trunc(double) nounwind readnone
diff --git a/test/CodeGen/PowerPC/sections.ll b/test/CodeGen/PowerPC/sections.ll
index 0ff4a89..37a8d16 100644
--- a/test/CodeGen/PowerPC/sections.ll
+++ b/test/CodeGen/PowerPC/sections.ll
@@ -5,4 +5,3 @@
 
 ; CHECK:  .section  .bss,"aw",@nobits
 ; CHECK:  .globl A
-
diff --git a/test/CodeGen/PowerPC/split-index-tc.ll b/test/CodeGen/PowerPC/split-index-tc.ll
new file mode 100644
index 0000000..03aff24
--- /dev/null
+++ b/test/CodeGen/PowerPC/split-index-tc.ll
@@ -0,0 +1,82 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%"class.llvm::MachineOperand" = type { i8, [3 x i8], i64, i64*, i64 }
+
+; Function Attrs: nounwind
+define void @_ZN4llvm17ScheduleDAGInstrs14addPhysRegDepsEPNS_5SUnitEj() #0 align 2 {
+
+; If we were able to split out the indexing, the load with update should be
+; removed (resulting in a nearly-empty output).
+; CHECK-LABEL: @_ZN4llvm17ScheduleDAGInstrs14addPhysRegDepsEPNS_5SUnitEj
+; CHECK-NOT: lhzu
+
+entry:
+  %0 = load %"class.llvm::MachineOperand"** undef, align 8
+  br i1 undef, label %_ZNK4llvm14MachineOperand6getRegEv.exit, label %cond.false.i123
+
+cond.false.i123:                                  ; preds = %_ZN4llvm12MachineInstr10getOperandEj.exit
+  unreachable
+
+_ZNK4llvm14MachineOperand6getRegEv.exit:          ; preds = %_ZN4llvm12MachineInstr10getOperandEj.exit
+  %IsDef.i = getelementptr inbounds %"class.llvm::MachineOperand"* %0, i64 undef, i32 1
+  %1 = bitcast [3 x i8]* %IsDef.i to i24*
+  %bf.load.i = load i24* %1, align 1
+  %2 = and i24 %bf.load.i, 128
+  br i1 undef, label %for.cond.cleanup, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %_ZNK4llvm14MachineOperand6getRegEv.exit
+  %3 = zext i24 %2 to i32
+  br i1 undef, label %cond.false.i134, label %_ZNK4llvm18MCRegAliasIteratordeEv.exit
+
+for.cond.cleanup:                                 ; preds = %_ZNK4llvm14MachineOperand6getRegEv.exit
+  br i1 undef, label %_ZNK4llvm14MachineOperand5isDefEv.exit, label %cond.false.i129
+
+cond.false.i129:                                  ; preds = %for.cond.cleanup
+  unreachable
+
+_ZNK4llvm14MachineOperand5isDefEv.exit:           ; preds = %for.cond.cleanup
+  br i1 undef, label %_ZNK4llvm14MachineOperand6getRegEv.exit247, label %cond.false.i244
+
+cond.false.i134:                                  ; preds = %for.body.lr.ph
+  unreachable
+
+_ZNK4llvm18MCRegAliasIteratordeEv.exit:           ; preds = %for.body.lr.ph
+  unreachable
+
+cond.false.i244:                                  ; preds = %_ZNK4llvm14MachineOperand5isDefEv.exit
+  unreachable
+
+_ZNK4llvm14MachineOperand6getRegEv.exit247:       ; preds = %_ZNK4llvm14MachineOperand5isDefEv.exit
+  br i1 undef, label %if.then53, label %if.end55
+
+if.then53:                                        ; preds = %_ZNK4llvm14MachineOperand6getRegEv.exit247
+  unreachable
+
+if.end55:                                         ; preds = %_ZNK4llvm14MachineOperand6getRegEv.exit247
+  br i1 undef, label %_ZNK4llvm14MachineOperand6isDeadEv.exit262, label %cond.false.i257
+
+cond.false.i257:                                  ; preds = %if.end55
+  unreachable
+
+_ZNK4llvm14MachineOperand6isDeadEv.exit262:       ; preds = %if.end55
+  %bf.load.i259 = load i24* %1, align 1
+  br i1 undef, label %if.then57, label %if.else59
+
+if.then57:                                        ; preds = %_ZNK4llvm14MachineOperand6isDeadEv.exit262
+  unreachable
+
+if.else59:                                        ; preds = %_ZNK4llvm14MachineOperand6isDeadEv.exit262
+  br i1 undef, label %if.end89, label %if.then62
+
+if.then62:                                        ; preds = %if.else59
+  unreachable
+
+if.end89:                                         ; preds = %if.else59
+  unreachable
+}
+
+attributes #0 = { nounwind }
+
+
diff --git a/test/CodeGen/PowerPC/stack-realign.ll b/test/CodeGen/PowerPC/stack-realign.ll
index 1c7a36a..a59fceb 100644
--- a/test/CodeGen/PowerPC/stack-realign.ll
+++ b/test/CodeGen/PowerPC/stack-realign.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
 ; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -disable-fp-elim < %s | FileCheck -check-prefix=CHECK-FP %s
+; RUN: llc -mtriple=powerpc-unknown-linux-gnu -disable-fp-elim < %s | FileCheck -check-prefix=CHECK-32 %s
+; RUN: llc -mtriple=powerpc-unknown-linux-gnu -disable-fp-elim -relocation-model=pic < %s | FileCheck -check-prefix=CHECK-32-PIC %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -7,6 +9,8 @@ target triple = "powerpc64-unknown-linux-gnu"
 
 declare void @bar(i32*)
 
+@barbaz = external global i32
+
 define void @goo(%struct.s* byval nocapture readonly %a) {
 entry:
   %x = alloca [2 x i32], align 32
@@ -16,8 +20,9 @@ entry:
   store i32 %0, i32* %arrayidx, align 32
   %b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
   %1 = load i32* %b, align 4
+  %2 = load i32* @barbaz, align 4
   %arrayidx2 = getelementptr inbounds [2 x i32]* %x, i64 0, i64 1
-  store i32 %1, i32* %arrayidx2, align 4
+  store i32 %2, i32* %arrayidx2, align 4
   call void @bar(i32* %arrayidx)
   ret void
 }
@@ -69,6 +74,24 @@ entry:
 ; CHECK-FP-DAG: mtlr 0
 ; CHECK-FP: blr
 
+; CHECK-32-LABEL: @goo
+; CHECK-32-DAG: mflr 0
+; CHECK-32-DAG: rlwinm [[REG:[0-9]+]], 1, 0, 27, 31
+; CHECK-32-DAG: stw 30, -8(1)
+; CHECK-32-DAG: mr 30, 1
+; CHECK-32-DAG: stw 0, 4(1)
+; CHECK-32-DAG: subfic 0, [[REG]], -64
+; CHECK-32: stwux 1, 1, 0
+
+; CHECK-32-PIC-LABEL: @goo
+; CHECK-32-PIC-DAG: mflr 0
+; CHECK-32-PIC-DAG: rlwinm [[REG:[0-9]+]], 1, 0, 27, 31
+; CHECK-32-PIC-DAG: stw 29, -12(1)
+; CHECK-32-PIC-DAG: mr 29, 1
+; CHECK-32-PIC-DAG: stw 0, 4(1)
+; CHECK-32-PIC-DAG: subfic 0, [[REG]], -64
+; CHECK-32-PIC: stwux 1, 1, 0
+
 ; The large-frame-size case.
 define void @hoo(%struct.s* byval nocapture readonly %a) {
 entry:
@@ -99,6 +122,34 @@ entry:
 
 ; CHECK: blr
 
+; CHECK-32-LABEL: @hoo
+
+; CHECK-32-DAG: lis [[REG1:[0-9]+]], -13
+; CHECK-32-DAG: rlwinm [[REG3:[0-9]+]], 1, 0, 27, 31
+; CHECK-32-DAG: mflr 0
+; CHECK-32-DAG: ori [[REG2:[0-9]+]], [[REG1]], 51904
+; CHECK-32-DAG: stw 30, -8(1)
+; CHECK-32-DAG: mr 30, 1
+; CHECK-32-DAG: stw 0, 4(1)
+; CHECK-32-DAG: subfc 0, [[REG3]], [[REG2]]
+; CHECK-32: stwux 1, 1, 0
+
+; CHECK-32: blr
+
+; CHECK-32-PIC-LABEL: @hoo
+
+; CHECK-32-PIC-DAG: lis [[REG1:[0-9]+]], -13
+; CHECK-32-PIC-DAG: rlwinm [[REG3:[0-9]+]], 1, 0, 27, 31
+; CHECK-32-PIC-DAG: mflr 0
+; CHECK-32-PIC-DAG: ori [[REG2:[0-9]+]], [[REG1]], 51904
+; CHECK-32-PIC-DAG: stw 29, -12(1)
+; CHECK-32-PIC-DAG: mr 29, 1
+; CHECK-32-PIC-DAG: stw 0, 4(1)
+; CHECK-32-PIC-DAG: subfc 0, [[REG3]], [[REG2]]
+; CHECK-32: stwux 1, 1, 0
+
+; CHECK-32: blr
+
 ; Make sure that the FP save area is still allocated correctly relative to
 ; where r30 is saved.
 define void @loo(%struct.s* byval nocapture readonly %a) {
diff --git a/test/CodeGen/PowerPC/subsumes-pred-regs.ll b/test/CodeGen/PowerPC/subsumes-pred-regs.ll
index da637cd..c510e36 100644
--- a/test/CodeGen/PowerPC/subsumes-pred-regs.ll
+++ b/test/CodeGen/PowerPC/subsumes-pred-regs.ll
@@ -35,7 +35,7 @@ if.then9.i39:                                     ; preds = %if.end7.i37
   br i1 %lnot.i.i16.i23, label %return, label %lor.rhs.i.i49
 
 ; CHECK: .LBB0_7:
-; CHECK:	beq 1, .LBB0_10
+; CHECK:	bne 1, .LBB0_10
 ; CHECK:	beq 0, .LBB0_10
 ; CHECK: .LBB0_9:
 
diff --git a/test/CodeGen/PowerPC/tls-pic.ll b/test/CodeGen/PowerPC/tls-pic.ll
index 9f3ab6e..9ba3725 100644
--- a/test/CodeGen/PowerPC/tls-pic.ll
+++ b/test/CodeGen/PowerPC/tls-pic.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -march=ppc64 -mcpu=pwr7 -O0 -relocation-model=pic < %s | FileCheck -check-prefix=OPT0 %s
 ; RUN: llc -march=ppc64 -mcpu=pwr7 -O1 -relocation-model=pic < %s | FileCheck -check-prefix=OPT1 %s
+; RUN: llc -march=ppc32 -O0 -relocation-model=pic < %s | FileCheck -check-prefix=OPT0-32 %s
+; RUN: llc -march=ppc32 -O1 -relocation-model=pic < %s | FileCheck -check-prefix=OPT1-32 %s
 
 target triple = "powerpc64-unknown-linux-gnu"
 ; Test correct assembly code generation for thread-local storage using
@@ -22,6 +24,16 @@ entry:
 ; OPT0-NEXT: nop
 ; OPT0:      addis [[REG2:[0-9]+]], 3, a@dtprel@ha
 ; OPT0-NEXT: addi {{[0-9]+}}, [[REG2]], a@dtprel@l
+; OPT0-32-LABEL: main
+; OPT0-32:        addi {{[0-9]+}}, {{[0-9]+}}, a@got@tlsld
+; OPT0-32:        bl __tls_get_addr(a@tlsld)@PLT
+; OPT0-32:        addis [[REG:[0-9]+]], 3, a@dtprel@ha
+; OPT0-32-NEXT:   addi  {{[0-9]+}}, [[REG]], a@dtprel@l
+; OPT1-32-LABEL: main
+; OPT1-32:        addi 3, {{[0-9]+}}, a@got@tlsld
+; OPT1-32:        bl __tls_get_addr(a@tlsld)@PLT
+; OPT1-32:        addis [[REG:[0-9]+]], 3, a@dtprel@ha
+; OPT1-32-NEXT:   addi  {{[0-9]+}}, [[REG]], a@dtprel@l
 
 ; Test peephole optimization for thread-local storage using the
 ; local dynamic model.
@@ -52,4 +64,6 @@ entry:
 ; OPT1-NEXT: addi 3, [[REG]], a2@got@tlsgd@l
 ; OPT1:      bl __tls_get_addr(a2@tlsgd)
 ; OPT1-NEXT: nop
-
+; OPT1-32-LABEL: main2
+; OPT1-32:        addi 3, {{[0-9]+}}, a2@got@tlsgd
+; OPT1-32:        bl __tls_get_addr(a2@tlsgd)@PLT
diff --git a/test/CodeGen/PowerPC/tls-store2.ll b/test/CodeGen/PowerPC/tls-store2.ll
new file mode 100644
index 0000000..f884dd8
--- /dev/null
+++ b/test/CodeGen/PowerPC/tls-store2.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=ppc64 -mcpu=pwr7 -O2 -relocation-model=pic < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Test back-to-back stores of TLS variables to ensure call sequences no
+; longer overlap.
+
+@__once_callable = external thread_local global i8**
+@__once_call = external thread_local global void ()*
+
+define i64 @call_once(i64 %flag, i8* %ptr) {
+entry:
+  %var = alloca i8*, align 8
+  store i8* %ptr, i8** %var, align 8
+  store i8** %var, i8*** @__once_callable, align 8
+  store void ()* @__once_call_impl, void ()** @__once_call, align 8
+  ret i64 %flag
+}
+
+; CHECK-LABEL: call_once:
+; CHECK: addis 3, 2, __once_callable@got@tlsgd@ha
+; CHECK: addi 3, 3, __once_callable@got@tlsgd@l
+; CHECK: bl __tls_get_addr(__once_callable@tlsgd)
+; CHECK-NEXT: nop
+; CHECK: std {{[0-9]+}}, 0(3)
+; CHECK: addis 3, 2, __once_call@got@tlsgd@ha
+; CHECK: addi 3, 3, __once_call@got@tlsgd@l
+; CHECK: bl __tls_get_addr(__once_call@tlsgd)
+; CHECK-NEXT: nop
+; CHECK: std {{[0-9]+}}, 0(3)
+
+declare void @__once_call_impl()
diff --git a/test/CodeGen/PowerPC/toc-load-sched-bug.ll b/test/CodeGen/PowerPC/toc-load-sched-bug.ll
new file mode 100644
index 0000000..d437915
--- /dev/null
+++ b/test/CodeGen/PowerPC/toc-load-sched-bug.ll
@@ -0,0 +1,534 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; This test checks for misordering of a TOC restore instruction relative
+; to subsequent uses of the TOC register.  Previously this test broke
+; because there was no TOC register dependency between the instructions,
+; and the usual stack-adjust instructions that held the TOC restore in
+; place were optimized away.
+
+%"class.llvm::Module" = type { %"class.llvm::LLVMContext"*, %"class.llvm::iplist", %"class.llvm::iplist.0", %"class.llvm::iplist.9", %"struct.llvm::ilist", %"class.std::basic_string", %"class.llvm::ValueSymbolTable"*, %"class.llvm::StringMap", %"class.std::unique_ptr", %"class.std::basic_string", %"class.std::basic_string", i8*, %"class.llvm::RandomNumberGenerator"*, %"class.std::basic_string", %"class.llvm::DataLayout" }
+%"class.llvm::iplist" = type { %"struct.llvm::ilist_traits", %"class.llvm::GlobalVariable"* }
+%"struct.llvm::ilist_traits" = type { %"class.llvm::ilist_node" }
+%"class.llvm::ilist_node" = type { %"class.llvm::ilist_half_node", %"class.llvm::GlobalVariable"* }
+%"class.llvm::ilist_half_node" = type { %"class.llvm::GlobalVariable"* }
+%"class.llvm::GlobalVariable" = type { %"class.llvm::GlobalObject", %"class.llvm::ilist_node", i8 }
+%"class.llvm::GlobalObject" = type { %"class.llvm::GlobalValue", %"class.std::basic_string", %"class.llvm::Comdat"* }
+%"class.llvm::GlobalValue" = type { %"class.llvm::Constant", i32, %"class.llvm::Module"* }
+%"class.llvm::Constant" = type { %"class.llvm::User" }
+%"class.llvm::User" = type { %"class.llvm::Value.base", i32, %"class.llvm::Use"* }
+%"class.llvm::Value.base" = type <{ i32 (...)**, %"class.llvm::Type"*, %"class.llvm::Use"*, %"class.llvm::StringMapEntry"*, i8, i8, i16 }>
+%"class.llvm::Type" = type { %"class.llvm::LLVMContext"*, i32, i32, %"class.llvm::Type"** }
+%"class.llvm::StringMapEntry" = type opaque
+%"class.llvm::Use" = type { %"class.llvm::Value"*, %"class.llvm::Use"*, %"class.llvm::PointerIntPair" }
+%"class.llvm::Value" = type { i32 (...)**, %"class.llvm::Type"*, %"class.llvm::Use"*, %"class.llvm::StringMapEntry"*, i8, i8, i16 }
+%"class.llvm::PointerIntPair" = type { i64 }
+%"class.llvm::Comdat" = type { %"class.llvm::StringMapEntry.43"*, i32 }
+%"class.llvm::StringMapEntry.43" = type opaque
+%"class.llvm::iplist.0" = type { %"struct.llvm::ilist_traits.1", %"class.llvm::Function"* }
+%"struct.llvm::ilist_traits.1" = type { %"class.llvm::ilist_node.7" }
+%"class.llvm::ilist_node.7" = type { %"class.llvm::ilist_half_node.8", %"class.llvm::Function"* }
+%"class.llvm::ilist_half_node.8" = type { %"class.llvm::Function"* }
+%"class.llvm::Function" = type { %"class.llvm::GlobalObject", %"class.llvm::ilist_node.7", %"class.llvm::iplist.44", %"class.llvm::iplist.52", %"class.llvm::ValueSymbolTable"*, %"class.llvm::AttributeSet" }
+%"class.llvm::iplist.44" = type { %"struct.llvm::ilist_traits.45", %"class.llvm::BasicBlock"* }
+%"struct.llvm::ilist_traits.45" = type { %"class.llvm::ilist_half_node.51" }
+%"class.llvm::ilist_half_node.51" = type { %"class.llvm::BasicBlock"* }
+%"class.llvm::BasicBlock" = type { %"class.llvm::Value.base", %"class.llvm::ilist_node.61", %"class.llvm::iplist.62", %"class.llvm::Function"* }
+%"class.llvm::ilist_node.61" = type { %"class.llvm::ilist_half_node.51", %"class.llvm::BasicBlock"* }
+%"class.llvm::iplist.62" = type { %"struct.llvm::ilist_traits.63", %"class.llvm::Instruction"* }
+%"struct.llvm::ilist_traits.63" = type { %"class.llvm::ilist_half_node.69" }
+%"class.llvm::ilist_half_node.69" = type { %"class.llvm::Instruction"* }
+%"class.llvm::Instruction" = type { %"class.llvm::User", %"class.llvm::ilist_node.70", %"class.llvm::BasicBlock"*, %"class.llvm::DebugLoc" }
+%"class.llvm::ilist_node.70" = type { %"class.llvm::ilist_half_node.69", %"class.llvm::Instruction"* }
+%"class.llvm::DebugLoc" = type { i32, i32 }
+%"class.llvm::iplist.52" = type { %"struct.llvm::ilist_traits.53", %"class.llvm::Argument"* }
+%"struct.llvm::ilist_traits.53" = type { %"class.llvm::ilist_half_node.59" }
+%"class.llvm::ilist_half_node.59" = type { %"class.llvm::Argument"* }
+%"class.llvm::Argument" = type { %"class.llvm::Value.base", %"class.llvm::ilist_node.60", %"class.llvm::Function"* }
+%"class.llvm::ilist_node.60" = type { %"class.llvm::ilist_half_node.59", %"class.llvm::Argument"* }
+%"class.llvm::AttributeSet" = type { %"class.llvm::AttributeSetImpl"* }
+%"class.llvm::AttributeSetImpl" = type opaque
+%"class.llvm::iplist.9" = type { %"struct.llvm::ilist_traits.10", %"class.llvm::GlobalAlias"* }
+%"struct.llvm::ilist_traits.10" = type { %"class.llvm::ilist_node.16" }
+%"class.llvm::ilist_node.16" = type { %"class.llvm::ilist_half_node.17", %"class.llvm::GlobalAlias"* }
+%"class.llvm::ilist_half_node.17" = type { %"class.llvm::GlobalAlias"* }
+%"class.llvm::GlobalAlias" = type { %"class.llvm::GlobalValue", %"class.llvm::ilist_node.16" }
+%"struct.llvm::ilist" = type { %"class.llvm::iplist.18" }
+%"class.llvm::iplist.18" = type { %"struct.llvm::ilist_traits.19", %"class.llvm::NamedMDNode"* }
+%"struct.llvm::ilist_traits.19" = type { %"class.llvm::ilist_node.24" }
+%"class.llvm::ilist_node.24" = type { %"class.llvm::ilist_half_node.25", %"class.llvm::NamedMDNode"* }
+%"class.llvm::ilist_half_node.25" = type { %"class.llvm::NamedMDNode"* }
+%"class.llvm::NamedMDNode" = type { %"class.llvm::ilist_node.24", %"class.std::basic_string", %"class.llvm::Module"*, i8* }
+%"class.llvm::ValueSymbolTable" = type opaque
+%"class.llvm::StringMap" = type { %"class.llvm::StringMapImpl", %"class.llvm::MallocAllocator" }
+%"class.llvm::StringMapImpl" = type { %"class.llvm::StringMapEntryBase"**, i32, i32, i32, i32 }
+%"class.llvm::StringMapEntryBase" = type { i32 }
+%"class.llvm::MallocAllocator" = type { i8 }
+%"class.std::unique_ptr" = type { %"class.std::tuple" }
+%"class.std::tuple" = type { %"struct.std::_Tuple_impl" }
+%"struct.std::_Tuple_impl" = type { %"struct.std::_Head_base.28" }
+%"struct.std::_Head_base.28" = type { %"class.llvm::GVMaterializer"* }
+%"class.llvm::GVMaterializer" = type opaque
+%"class.llvm::RandomNumberGenerator" = type opaque
+%"class.std::basic_string" = type { %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" }
+%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" = type { i8* }
+%"class.llvm::DataLayout" = type { i8, i32, i32, [4 x i8], %"class.llvm::SmallVector", %"class.llvm::SmallVector.29", %"class.llvm::SmallVector.36", i8* }
+%"class.llvm::SmallVector" = type { %"class.llvm::SmallVectorImpl.base", %"struct.llvm::SmallVectorStorage" }
+%"class.llvm::SmallVectorImpl.base" = type { %"class.llvm::SmallVectorTemplateBase.base" }
+%"class.llvm::SmallVectorTemplateBase.base" = type { %"class.llvm::SmallVectorTemplateCommon.base" }
+%"class.llvm::SmallVectorTemplateCommon.base" = type <{ %"class.llvm::SmallVectorBase", %"struct.llvm::AlignedCharArrayUnion" }>
+%"class.llvm::SmallVectorBase" = type { i8*, i8*, i8* }
+%"struct.llvm::AlignedCharArrayUnion" = type { %"struct.llvm::AlignedCharArray" }
+%"struct.llvm::AlignedCharArray" = type { [1 x i8] }
+%"struct.llvm::SmallVectorStorage" = type { [7 x %"struct.llvm::AlignedCharArrayUnion"] }
+%"class.llvm::SmallVector.29" = type { %"class.llvm::SmallVectorImpl.30", %"struct.llvm::SmallVectorStorage.35" }
+%"class.llvm::SmallVectorImpl.30" = type { %"class.llvm::SmallVectorTemplateBase.31" }
+%"class.llvm::SmallVectorTemplateBase.31" = type { %"class.llvm::SmallVectorTemplateCommon.32" }
+%"class.llvm::SmallVectorTemplateCommon.32" = type { %"class.llvm::SmallVectorBase", %"struct.llvm::AlignedCharArrayUnion.33" }
+%"struct.llvm::AlignedCharArrayUnion.33" = type { %"struct.llvm::AlignedCharArray.34" }
+%"struct.llvm::AlignedCharArray.34" = type { [8 x i8] }
+%"struct.llvm::SmallVectorStorage.35" = type { [15 x %"struct.llvm::AlignedCharArrayUnion.33"] }
+%"class.llvm::SmallVector.36" = type { %"class.llvm::SmallVectorImpl.37", %"struct.llvm::SmallVectorStorage.42" }
+%"class.llvm::SmallVectorImpl.37" = type { %"class.llvm::SmallVectorTemplateBase.38" }
+%"class.llvm::SmallVectorTemplateBase.38" = type { %"class.llvm::SmallVectorTemplateCommon.39" }
+%"class.llvm::SmallVectorTemplateCommon.39" = type { %"class.llvm::SmallVectorBase", %"struct.llvm::AlignedCharArrayUnion.40" }
+%"struct.llvm::AlignedCharArrayUnion.40" = type { %"struct.llvm::AlignedCharArray.41" }
+%"struct.llvm::AlignedCharArray.41" = type { [16 x i8] }
+%"struct.llvm::SmallVectorStorage.42" = type { [7 x %"struct.llvm::AlignedCharArrayUnion.40"] }
+%"class.llvm::SMDiagnostic" = type { %"class.llvm::SourceMgr"*, %"class.llvm::SMLoc", %"class.std::basic_string", i32, i32, i32, %"class.std::basic_string", %"class.std::basic_string", %"class.std::vector.79", %"class.llvm::SmallVector.84" }
+%"class.llvm::SourceMgr" = type { %"class.std::vector", %"class.std::vector.74", i8*, void (%"class.llvm::SMDiagnostic"*, i8*)*, i8* }
+%"class.std::vector" = type { %"struct.std::_Vector_base" }
+%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<llvm::SourceMgr::SrcBuffer, std::allocator<llvm::SourceMgr::SrcBuffer> >::_Vector_impl" }
+%"struct.std::_Vector_base<llvm::SourceMgr::SrcBuffer, std::allocator<llvm::SourceMgr::SrcBuffer> >::_Vector_impl" = type { %"struct.llvm::SourceMgr::SrcBuffer"*, %"struct.llvm::SourceMgr::SrcBuffer"*, %"struct.llvm::SourceMgr::SrcBuffer"* }
+%"struct.llvm::SourceMgr::SrcBuffer" = type { %"class.llvm::MemoryBuffer"*, %"class.llvm::SMLoc" }
+%"class.llvm::MemoryBuffer" = type { i32 (...)**, i8*, i8* }
+%"class.std::vector.74" = type { %"struct.std::_Vector_base.75" }
+%"struct.std::_Vector_base.75" = type { %"struct.std::_Vector_base<std::basic_string<char>, std::allocator<std::basic_string<char> > >::_Vector_impl" }
+%"struct.std::_Vector_base<std::basic_string<char>, std::allocator<std::basic_string<char> > >::_Vector_impl" = type { %"class.std::basic_string"*, %"class.std::basic_string"*, %"class.std::basic_string"* }
+%"class.llvm::SMLoc" = type { i8* }
+%"class.std::vector.79" = type { %"struct.std::_Vector_base.80" }
+%"struct.std::_Vector_base.80" = type { %"struct.std::_Vector_base<std::pair<unsigned int, unsigned int>, std::allocator<std::pair<unsigned int, unsigned int> > >::_Vector_impl" }
+%"struct.std::_Vector_base<std::pair<unsigned int, unsigned int>, std::allocator<std::pair<unsigned int, unsigned int> > >::_Vector_impl" = type { %"struct.std::pair"*, %"struct.std::pair"*, %"struct.std::pair"* }
+%"struct.std::pair" = type { i32, i32 }
+%"class.llvm::SmallVector.84" = type { %"class.llvm::SmallVectorImpl.85", %"struct.llvm::SmallVectorStorage.90" }
+%"class.llvm::SmallVectorImpl.85" = type { %"class.llvm::SmallVectorTemplateBase.86" }
+%"class.llvm::SmallVectorTemplateBase.86" = type { %"class.llvm::SmallVectorTemplateCommon.87" }
+%"class.llvm::SmallVectorTemplateCommon.87" = type { %"class.llvm::SmallVectorBase", %"struct.llvm::AlignedCharArrayUnion.88" }
+%"struct.llvm::AlignedCharArrayUnion.88" = type { %"struct.llvm::AlignedCharArray.89" }
+%"struct.llvm::AlignedCharArray.89" = type { [24 x i8] }
+%"struct.llvm::SmallVectorStorage.90" = type { [3 x %"struct.llvm::AlignedCharArrayUnion.88"] }
+%"class.llvm::LLVMContext" = type { %"class.llvm::LLVMContextImpl"* }
+%"class.llvm::LLVMContextImpl" = type opaque
+%"class.std::allocator" = type { i8 }
+%"class.llvm::ErrorOr.109" = type { %union.anon.110, i8, [7 x i8] }
+%union.anon.110 = type { %"struct.llvm::AlignedCharArrayUnion.93" }
+%"struct.llvm::AlignedCharArrayUnion.93" = type { %"struct.llvm::AlignedCharArray.94" }
+%"struct.llvm::AlignedCharArray.94" = type { [16 x i8] }
+%"class.llvm::ErrorOr" = type { %union.anon, i8, [7 x i8] }
+%union.anon = type { %"struct.llvm::AlignedCharArrayUnion.93" }
+%"class.std::error_category" = type { i32 (...)** }
+%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep" = type { %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep_base" }
+%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep_base" = type { i64, i64, i32 }
+%"class.llvm::SMFixIt" = type { %"class.llvm::SMRange", %"class.std::basic_string" }
+%"class.llvm::SMRange" = type { %"class.llvm::SMLoc", %"class.llvm::SMLoc" }
+%"struct.llvm::NamedRegionTimer" = type { %"class.llvm::TimeRegion" }
+%"class.llvm::TimeRegion" = type { %"class.llvm::Timer"* }
+%"class.llvm::Timer" = type { %"class.llvm::TimeRecord", %"class.std::basic_string", i8, %"class.llvm::TimerGroup"*, %"class.llvm::Timer"**, %"class.llvm::Timer"* }
+%"class.llvm::TimeRecord" = type { double, double, double, i64 }
+%"class.llvm::TimerGroup" = type { %"class.std::basic_string", %"class.llvm::Timer"*, %"class.std::vector.103", %"class.llvm::TimerGroup"**, %"class.llvm::TimerGroup"* }
+%"class.std::vector.103" = type { %"struct.std::_Vector_base.104" }
+%"struct.std::_Vector_base.104" = type { %"struct.std::_Vector_base<std::pair<llvm::TimeRecord, std::basic_string<char> >, std::allocator<std::pair<llvm::TimeRecord, std::basic_string<char> > > >::_Vector_impl" }
+%"struct.std::_Vector_base<std::pair<llvm::TimeRecord, std::basic_string<char> >, std::allocator<std::pair<llvm::TimeRecord, std::basic_string<char> > > >::_Vector_impl" = type { %"struct.std::pair.108"*, %"struct.std::pair.108"*, %"struct.std::pair.108"* }
+%"struct.std::pair.108" = type opaque
+%struct.LLVMOpaqueContext = type opaque
+%struct.LLVMOpaqueMemoryBuffer = type opaque
+%struct.LLVMOpaqueModule = type opaque
+%"class.llvm::raw_string_ostream" = type { %"class.llvm::raw_ostream.base", %"class.std::basic_string"* }
+%"class.llvm::raw_ostream.base" = type <{ i32 (...)**, i8*, i8*, i8*, i32 }>
+%"class.llvm::raw_ostream" = type { i32 (...)**, i8*, i8*, i8*, i32 }
+
+@.str = private unnamed_addr constant [28 x i8] c"Could not open input file: \00", align 1
+@.str1 = private unnamed_addr constant [54 x i8] c"!HasError && \22Cannot get value when an error exists!\22\00", align 1
+@.str2 = private unnamed_addr constant [61 x i8] c"/home/wschmidt/llvm/llvm-test/include/llvm/Support/ErrorOr.h\00", align 1
+@__PRETTY_FUNCTION__._ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE10getStorageEv = private unnamed_addr constant [206 x i8] c"storage_type *llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer> > >::getStorage() [T = std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer> >]\00", align 1
+@_ZNSs4_Rep20_S_empty_rep_storageE = external global [0 x i64]
+
+declare void @_ZN4llvm12MemoryBuffer14getFileOrSTDINENS_9StringRefEl(%"class.llvm::ErrorOr"* sret, [2 x i64], i64) #1
+
+declare void @_ZN4llvm16NamedRegionTimerC1ENS_9StringRefES1_b(%"struct.llvm::NamedRegionTimer"*, [2 x i64], [2 x i64], i1 zeroext) #1
+
+; Function Attrs: nounwind
+define %"class.llvm::Module"* @_ZN4llvm11ParseIRFileERKSsRNS_12SMDiagnosticERNS_11LLVMContextE(%"class.std::basic_string"* nocapture readonly dereferenceable(8) %Filename, %"class.llvm::SMDiagnostic"* dereferenceable(200) %Err, %"class.llvm::LLVMContext"* dereferenceable(8) %Context) #0 {
+entry:
+; CHECK: .globl	_ZN4llvm11ParseIRFileERKSsRNS_12SMDiagnosticERNS_11LLVMContextE
+; CHECK: bctrl
+; CHECK: ld 2, 24(1)
+; CHECK: addis [[REG:[0-9]+]], 2, .L.str@toc@ha
+; CHECK: addi {{[0-9]+}}, [[REG]], .L.str@toc@l
+; CHECK: bl _ZNSs6insertEmPKcm
+  %.atomicdst.i.i.i.i.i46 = alloca i32, align 4
+  %ref.tmp.i.i47 = alloca %"class.std::allocator", align 1
+  %.atomicdst.i.i.i.i.i = alloca i32, align 4
+  %ref.tmp.i.i = alloca %"class.std::allocator", align 1
+  %ref.tmp.i.i2.i = alloca %"class.std::allocator", align 1
+  %ref.tmp.i.i.i = alloca %"class.std::allocator", align 1
+  %FileOrErr = alloca %"class.llvm::ErrorOr", align 8
+  %ref.tmp = alloca %"class.llvm::SMDiagnostic", align 8
+  %ref.tmp5 = alloca %"class.std::basic_string", align 8
+  %_M_p.i.i.i = getelementptr inbounds %"class.std::basic_string"* %Filename, i64 0, i32 0, i32 0
+  %0 = load i8** %_M_p.i.i.i, align 8, !tbaa !1
+  %1 = ptrtoint i8* %0 to i64
+  %arrayidx.i.i.i = getelementptr inbounds i8* %0, i64 -24
+  %_M_length.i.i = bitcast i8* %arrayidx.i.i.i to i64*
+  %2 = load i64* %_M_length.i.i, align 8, !tbaa !7
+  %.fca.0.insert18 = insertvalue [2 x i64] undef, i64 %1, 0
+  %.fca.1.insert21 = insertvalue [2 x i64] %.fca.0.insert18, i64 %2, 1
+  call void @_ZN4llvm12MemoryBuffer14getFileOrSTDINENS_9StringRefEl(%"class.llvm::ErrorOr"* sret %FileOrErr, [2 x i64] %.fca.1.insert21, i64 -1) #3
+  %HasError.i24 = getelementptr inbounds %"class.llvm::ErrorOr"* %FileOrErr, i64 0, i32 1
+  %bf.load.i25 = load i8* %HasError.i24, align 8
+  %3 = and i8 %bf.load.i25, 1
+  %bf.cast.i26 = icmp eq i8 %3, 0
+  br i1 %bf.cast.i26, label %_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE3getEv.exit, label %_ZNK4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE8getErrorEv.exit
+
+_ZNK4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE8getErrorEv.exit: ; preds = %entry
+  %retval.sroa.0.0..sroa_cast.i = bitcast %"class.llvm::ErrorOr"* %FileOrErr to i64*
+  %retval.sroa.0.0.copyload.i = load i64* %retval.sroa.0.0..sroa_cast.i, align 8
+  %retval.sroa.3.0..sroa_idx.i = getelementptr inbounds %"class.llvm::ErrorOr"* %FileOrErr, i64 0, i32 0, i32 0, i32 0, i32 0, i64 8
+  %retval.sroa.3.0..sroa_cast.i = bitcast i8* %retval.sroa.3.0..sroa_idx.i to i64*
+  %retval.sroa.3.0.copyload.i = load i64* %retval.sroa.3.0..sroa_cast.i, align 8
+  %phitmp = trunc i64 %retval.sroa.0.0.copyload.i to i32
+  %cmp.i = icmp eq i32 %phitmp, 0
+  br i1 %cmp.i, label %cond.false.i.i, label %if.then
+
+if.then:                                          ; preds = %_ZNK4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE8getErrorEv.exit
+  %.c = inttoptr i64 %retval.sroa.3.0.copyload.i to %"class.std::error_category"*
+  %4 = load i8** %_M_p.i.i.i, align 8, !tbaa !1
+  %arrayidx.i.i.i30 = getelementptr inbounds i8* %4, i64 -24
+  %_M_length.i.i31 = bitcast i8* %arrayidx.i.i.i30 to i64*
+  %5 = load i64* %_M_length.i.i31, align 8, !tbaa !7
+  %6 = inttoptr i64 %retval.sroa.3.0.copyload.i to void (%"class.std::basic_string"*, %"class.std::error_category"*, i32)***
+  %vtable.i = load void (%"class.std::basic_string"*, %"class.std::error_category"*, i32)*** %6, align 8, !tbaa !11
+  %vfn.i = getelementptr inbounds void (%"class.std::basic_string"*, %"class.std::error_category"*, i32)** %vtable.i, i64 3
+  %7 = load void (%"class.std::basic_string"*, %"class.std::error_category"*, i32)** %vfn.i, align 8
+  call void %7(%"class.std::basic_string"* sret %ref.tmp5, %"class.std::error_category"* %.c, i32 signext %phitmp) #3
+  %call2.i.i = call dereferenceable(8) %"class.std::basic_string"* @_ZNSs6insertEmPKcm(%"class.std::basic_string"* %ref.tmp5, i64 0, i8* getelementptr inbounds ([28 x i8]* @.str, i64 0, i64 0), i64 27) #3
+  %_M_p2.i.i.i.i = getelementptr inbounds %"class.std::basic_string"* %call2.i.i, i64 0, i32 0, i32 0
+  %8 = load i8** %_M_p2.i.i.i.i, align 8, !tbaa !13
+  store i8* bitcast (i64* getelementptr inbounds ([0 x i64]* @_ZNSs4_Rep20_S_empty_rep_storageE, i64 0, i64 3) to i8*), i8** %_M_p2.i.i.i.i, align 8, !tbaa !1
+  %arrayidx.i.i.i36 = getelementptr inbounds i8* %8, i64 -24
+  %_M_length.i.i37 = bitcast i8* %arrayidx.i.i.i36 to i64*
+  %9 = load i64* %_M_length.i.i37, align 8, !tbaa !7
+  %Filename.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 2
+  %10 = getelementptr inbounds %"class.std::allocator"* %ref.tmp.i.i2.i, i64 0, i32 0
+  %11 = bitcast %"class.llvm::SMDiagnostic"* %ref.tmp to i8*
+  call void @llvm.memset.p0i8.i64(i8* %11, i8 0, i64 16, i32 8, i1 false) #3
+  call void @llvm.lifetime.start(i64 1, i8* %10) #3
+  %tobool.i.i4.i = icmp eq i8* %4, null
+  br i1 %tobool.i.i4.i, label %if.then.i.i6.i, label %if.end.i.i8.i
+
+if.then.i.i6.i:                                   ; preds = %if.then
+  %_M_p.i.i.i.i.i.i5.i = getelementptr inbounds %"class.std::basic_string"* %Filename.i, i64 0, i32 0, i32 0
+  store i8* bitcast (i64* getelementptr inbounds ([0 x i64]* @_ZNSs4_Rep20_S_empty_rep_storageE, i64 0, i64 3) to i8*), i8** %_M_p.i.i.i.i.i.i5.i, align 8, !tbaa !13
+  br label %_ZNK4llvm9StringRefcvSsEv.exit9.i
+
+if.end.i.i8.i:                                    ; preds = %if.then
+  call void @_ZNSsC1EPKcmRKSaIcE(%"class.std::basic_string"* %Filename.i, i8* %4, i64 %5, %"class.std::allocator"* dereferenceable(1) %ref.tmp.i.i2.i) #3
+  br label %_ZNK4llvm9StringRefcvSsEv.exit9.i
+
+_ZNK4llvm9StringRefcvSsEv.exit9.i:                ; preds = %if.end.i.i8.i, %if.then.i.i6.i
+  call void @llvm.lifetime.end(i64 1, i8* %10) #3
+  %LineNo.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 3
+  store i32 -1, i32* %LineNo.i, align 8, !tbaa !14
+  %ColumnNo.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 4
+  store i32 -1, i32* %ColumnNo.i, align 4, !tbaa !21
+  %Kind.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 5
+  store i32 0, i32* %Kind.i, align 8, !tbaa !22
+  %Message.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 6
+  %12 = getelementptr inbounds %"class.std::allocator"* %ref.tmp.i.i.i, i64 0, i32 0
+  call void @llvm.lifetime.start(i64 1, i8* %12) #3
+  %tobool.i.i.i = icmp eq i8* %8, null
+  br i1 %tobool.i.i.i, label %if.then.i.i.i, label %if.end.i.i.i
+
+if.then.i.i.i:                                    ; preds = %_ZNK4llvm9StringRefcvSsEv.exit9.i
+  %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::basic_string"* %Message.i, i64 0, i32 0, i32 0
+  store i8* bitcast (i64* getelementptr inbounds ([0 x i64]* @_ZNSs4_Rep20_S_empty_rep_storageE, i64 0, i64 3) to i8*), i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !13
+  br label %_ZN4llvm12SMDiagnosticC2ENS_9StringRefENS_9SourceMgr8DiagKindES1_.exit
+
+if.end.i.i.i:                                     ; preds = %_ZNK4llvm9StringRefcvSsEv.exit9.i
+  call void @_ZNSsC1EPKcmRKSaIcE(%"class.std::basic_string"* %Message.i, i8* %8, i64 %9, %"class.std::allocator"* dereferenceable(1) %ref.tmp.i.i.i) #3
+  br label %_ZN4llvm12SMDiagnosticC2ENS_9StringRefENS_9SourceMgr8DiagKindES1_.exit
+
+_ZN4llvm12SMDiagnosticC2ENS_9StringRefENS_9SourceMgr8DiagKindES1_.exit: ; preds = %if.then.i.i.i, %if.end.i.i.i
+  call void @llvm.lifetime.end(i64 1, i8* %12) #3
+  %_M_p.i.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 7, i32 0, i32 0
+  store i8* bitcast (i64* getelementptr inbounds ([0 x i64]* @_ZNSs4_Rep20_S_empty_rep_storageE, i64 0, i64 3) to i8*), i8** %_M_p.i.i.i.i.i, align 8, !tbaa !13
+  %Ranges.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 8
+  %13 = bitcast %"class.std::vector.79"* %Ranges.i to i8*
+  call void @llvm.memset.p0i8.i64(i8* %13, i8 0, i64 24, i32 8, i1 false) #3
+  %14 = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 9, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i64 0
+  %BeginX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0
+  store i8* %14, i8** %BeginX.i.i.i.i.i.i, align 8, !tbaa !23
+  %EndX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 1
+  store i8* %14, i8** %EndX.i.i.i.i.i.i, align 8, !tbaa !25
+  %CapacityX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 2
+  %add.ptr.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 9, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i64 96
+  store i8* %add.ptr.i.i.i.i.i.i, i8** %CapacityX.i.i.i.i.i.i, align 8, !tbaa !26
+  %15 = bitcast %"class.llvm::SMDiagnostic"* %Err to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %15, i8* %11, i64 16, i32 8, i1 false) #3
+  %Filename.i38 = getelementptr inbounds %"class.llvm::SMDiagnostic"* %Err, i64 0, i32 2
+  call void @_ZNSs4swapERSs(%"class.std::basic_string"* %Filename.i38, %"class.std::basic_string"* dereferenceable(8) %Filename.i) #3
+  %LineNo.i39 = getelementptr inbounds %"class.llvm::SMDiagnostic"* %Err, i64 0, i32 3
+  %16 = bitcast i32* %LineNo.i39 to i8*
+  %17 = bitcast i32* %LineNo.i to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %16, i8* %17, i64 12, i32 4, i1 false) #3
+  %Message.i40 = getelementptr inbounds %"class.llvm::SMDiagnostic"* %Err, i64 0, i32 6
+  call void @_ZNSs4swapERSs(%"class.std::basic_string"* %Message.i40, %"class.std::basic_string"* dereferenceable(8) %Message.i) #3
+  %LineContents.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %Err, i64 0, i32 7
+  %LineContents7.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 7
+  call void @_ZNSs4swapERSs(%"class.std::basic_string"* %LineContents.i, %"class.std::basic_string"* dereferenceable(8) %LineContents7.i) #3
+  %Ranges.i41 = getelementptr inbounds %"class.llvm::SMDiagnostic"* %Err, i64 0, i32 8
+  %_M_start.i7.i.i.i = getelementptr inbounds %"class.std::vector.79"* %Ranges.i41, i64 0, i32 0, i32 0, i32 0
+  %18 = load %"struct.std::pair"** %_M_start.i7.i.i.i, align 8, !tbaa !27
+  %_M_finish.i9.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %Err, i64 0, i32 8, i32 0, i32 0, i32 1
+  %_M_end_of_storage.i11.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %Err, i64 0, i32 8, i32 0, i32 0, i32 2
+  %_M_start2.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 8, i32 0, i32 0, i32 0
+  %19 = bitcast %"class.std::vector.79"* %Ranges.i41 to i8*
+  call void @llvm.memset.p0i8.i64(i8* %19, i8 0, i64 16, i32 8, i1 false) #3
+  %20 = load %"struct.std::pair"** %_M_start2.i.i.i.i, align 8, !tbaa !27
+  store %"struct.std::pair"* %20, %"struct.std::pair"** %_M_start.i7.i.i.i, align 8, !tbaa !27
+  store %"struct.std::pair"* null, %"struct.std::pair"** %_M_start2.i.i.i.i, align 8, !tbaa !27
+  %_M_finish3.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 8, i32 0, i32 0, i32 1
+  %21 = load %"struct.std::pair"** %_M_finish3.i.i.i.i, align 8, !tbaa !27
+  store %"struct.std::pair"* %21, %"struct.std::pair"** %_M_finish.i9.i.i.i, align 8, !tbaa !27
+  store %"struct.std::pair"* null, %"struct.std::pair"** %_M_finish3.i.i.i.i, align 8, !tbaa !27
+  %_M_end_of_storage4.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 8, i32 0, i32 0, i32 2
+  %22 = load %"struct.std::pair"** %_M_end_of_storage4.i.i.i.i, align 8, !tbaa !27
+  store %"struct.std::pair"* %22, %"struct.std::pair"** %_M_end_of_storage.i11.i.i.i, align 8, !tbaa !27
+  store %"struct.std::pair"* null, %"struct.std::pair"** %_M_end_of_storage4.i.i.i.i, align 8, !tbaa !27
+  %tobool.i.i.i.i.i.i = icmp eq %"struct.std::pair"* %18, null
+  br i1 %tobool.i.i.i.i.i.i, label %_ZN4llvm12SMDiagnosticaSEOS0_.exit, label %if.then.i.i.i.i.i.i
+
+if.then.i.i.i.i.i.i:                              ; preds = %_ZN4llvm12SMDiagnosticC2ENS_9StringRefENS_9SourceMgr8DiagKindES1_.exit
+  %23 = bitcast %"struct.std::pair"* %18 to i8*
+  call void @_ZdlPv(i8* %23) #3
+  br label %_ZN4llvm12SMDiagnosticaSEOS0_.exit
+
+_ZN4llvm12SMDiagnosticaSEOS0_.exit:               ; preds = %_ZN4llvm12SMDiagnosticC2ENS_9StringRefENS_9SourceMgr8DiagKindES1_.exit, %if.then.i.i.i.i.i.i
+  %24 = getelementptr inbounds %"class.llvm::SMDiagnostic"* %Err, i64 0, i32 9, i32 0
+  %25 = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 9, i32 0
+  %call2.i.i42 = call dereferenceable(48) %"class.llvm::SmallVectorImpl.85"* @_ZN4llvm15SmallVectorImplINS_7SMFixItEEaSEOS2_(%"class.llvm::SmallVectorImpl.85"* %24, %"class.llvm::SmallVectorImpl.85"* dereferenceable(48) %25) #3
+  call void @_ZN4llvm12SMDiagnosticD2Ev(%"class.llvm::SMDiagnostic"* %ref.tmp) #3
+  %26 = getelementptr inbounds %"class.std::allocator"* %ref.tmp.i.i, i64 0, i32 0
+  call void @llvm.lifetime.start(i64 1, i8* %26) #3
+  %27 = bitcast i8* %arrayidx.i.i.i36 to %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep"*
+  %cmp.i.i.i = icmp eq i8* %arrayidx.i.i.i36, bitcast ([0 x i64]* @_ZNSs4_Rep20_S_empty_rep_storageE to i8*)
+  br i1 %cmp.i.i.i, label %_ZNSsD1Ev.exit, label %if.then.i.i.i45, !prof !28
+
+if.then.i.i.i45:                                  ; preds = %_ZN4llvm12SMDiagnosticaSEOS0_.exit
+  %_M_refcount.i.i.i = getelementptr inbounds i8* %8, i64 -8
+  %28 = bitcast i8* %_M_refcount.i.i.i to i32*
+  br i1 icmp ne (i8* bitcast (i32 (i32*, void (i8*)*)* @__pthread_key_create to i8*), i8* null), label %if.then.i.i.i.i, label %if.else.i.i.i.i
+
+if.then.i.i.i.i:                                  ; preds = %if.then.i.i.i45
+  %.atomicdst.i.i.i.i.i.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast = bitcast i32* %.atomicdst.i.i.i.i.i to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %.atomicdst.i.i.i.i.i.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast)
+  %29 = atomicrmw volatile add i32* %28, i32 -1 acq_rel
+  store i32 %29, i32* %.atomicdst.i.i.i.i.i, align 4
+  %.atomicdst.i.i.i.i.i.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..atomicdst.0..atomicdst.0..i.i.i.i.i = load volatile i32* %.atomicdst.i.i.i.i.i, align 4
+  call void @llvm.lifetime.end(i64 4, i8* %.atomicdst.i.i.i.i.i.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast)
+  br label %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i
+
+if.else.i.i.i.i:                                  ; preds = %if.then.i.i.i45
+  %30 = load i32* %28, align 4, !tbaa !29
+  %add.i.i.i.i.i = add nsw i32 %30, -1
+  store i32 %add.i.i.i.i.i, i32* %28, align 4, !tbaa !29
+  br label %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i
+
+_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i: ; preds = %if.else.i.i.i.i, %if.then.i.i.i.i
+  %retval.0.i.i.i.i = phi i32 [ %.atomicdst.i.i.i.i.i.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..atomicdst.0..atomicdst.0..i.i.i.i.i, %if.then.i.i.i.i ], [ %30, %if.else.i.i.i.i ]
+  %cmp3.i.i.i = icmp slt i32 %retval.0.i.i.i.i, 1
+  br i1 %cmp3.i.i.i, label %if.then4.i.i.i, label %_ZNSsD1Ev.exit
+
+if.then4.i.i.i:                                   ; preds = %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i
+  call void @_ZNSs4_Rep10_M_destroyERKSaIcE(%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep"* %27, %"class.std::allocator"* dereferenceable(1) %ref.tmp.i.i) #3
+  br label %_ZNSsD1Ev.exit
+
+_ZNSsD1Ev.exit:                                   ; preds = %_ZN4llvm12SMDiagnosticaSEOS0_.exit, %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i, %if.then4.i.i.i
+  call void @llvm.lifetime.end(i64 1, i8* %26) #3
+  %31 = getelementptr inbounds %"class.std::allocator"* %ref.tmp.i.i47, i64 0, i32 0
+  call void @llvm.lifetime.start(i64 1, i8* %31) #3
+  %_M_p.i.i.i.i48 = getelementptr inbounds %"class.std::basic_string"* %ref.tmp5, i64 0, i32 0, i32 0
+  %32 = load i8** %_M_p.i.i.i.i48, align 8, !tbaa !1
+  %arrayidx.i.i.i49 = getelementptr inbounds i8* %32, i64 -24
+  %33 = bitcast i8* %arrayidx.i.i.i49 to %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep"*
+  %cmp.i.i.i50 = icmp eq i8* %arrayidx.i.i.i49, bitcast ([0 x i64]* @_ZNSs4_Rep20_S_empty_rep_storageE to i8*)
+  br i1 %cmp.i.i.i50, label %_ZNSsD1Ev.exit62, label %if.then.i.i.i52, !prof !28
+
+if.then.i.i.i52:                                  ; preds = %_ZNSsD1Ev.exit
+  %_M_refcount.i.i.i51 = getelementptr inbounds i8* %32, i64 -8
+  %34 = bitcast i8* %_M_refcount.i.i.i51 to i32*
+  br i1 icmp ne (i8* bitcast (i32 (i32*, void (i8*)*)* @__pthread_key_create to i8*), i8* null), label %if.then.i.i.i.i55, label %if.else.i.i.i.i57
+
+if.then.i.i.i.i55:                                ; preds = %if.then.i.i.i52
+  %.atomicdst.i.i.i.i.i46.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast = bitcast i32* %.atomicdst.i.i.i.i.i46 to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %.atomicdst.i.i.i.i.i46.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast)
+  %35 = atomicrmw volatile add i32* %34, i32 -1 acq_rel
+  store i32 %35, i32* %.atomicdst.i.i.i.i.i46, align 4
+  %.atomicdst.i.i.i.i.i46.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..atomicdst.0..atomicdst.0..i.i.i.i.i54 = load volatile i32* %.atomicdst.i.i.i.i.i46, align 4
+  call void @llvm.lifetime.end(i64 4, i8* %.atomicdst.i.i.i.i.i46.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast)
+  br label %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i60
+
+if.else.i.i.i.i57:                                ; preds = %if.then.i.i.i52
+  %36 = load i32* %34, align 4, !tbaa !29
+  %add.i.i.i.i.i56 = add nsw i32 %36, -1
+  store i32 %add.i.i.i.i.i56, i32* %34, align 4, !tbaa !29
+  br label %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i60
+
+_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i60: ; preds = %if.else.i.i.i.i57, %if.then.i.i.i.i55
+  %retval.0.i.i.i.i58 = phi i32 [ %.atomicdst.i.i.i.i.i46.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..atomicdst.0..atomicdst.0..i.i.i.i.i54, %if.then.i.i.i.i55 ], [ %36, %if.else.i.i.i.i57 ]
+  %cmp3.i.i.i59 = icmp slt i32 %retval.0.i.i.i.i58, 1
+  br i1 %cmp3.i.i.i59, label %if.then4.i.i.i61, label %_ZNSsD1Ev.exit62
+
+if.then4.i.i.i61:                                 ; preds = %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i60
+  call void @_ZNSs4_Rep10_M_destroyERKSaIcE(%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep"* %33, %"class.std::allocator"* dereferenceable(1) %ref.tmp.i.i47) #3
+  br label %_ZNSsD1Ev.exit62
+
+_ZNSsD1Ev.exit62:                                 ; preds = %_ZNSsD1Ev.exit, %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i60, %if.then4.i.i.i61
+  call void @llvm.lifetime.end(i64 1, i8* %31) #3
+  br label %cleanup
+
+cond.false.i.i:                                   ; preds = %_ZNK4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE8getErrorEv.exit
+  call void @__assert_fail(i8* getelementptr inbounds ([54 x i8]* @.str1, i64 0, i64 0), i8* getelementptr inbounds ([61 x i8]* @.str2, i64 0, i64 0), i32 zeroext 242, i8* getelementptr inbounds ([206 x i8]* @__PRETTY_FUNCTION__._ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE10getStorageEv, i64 0, i64 0)) #7
+  unreachable
+
+_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE3getEv.exit: ; preds = %entry
+  %_M_head_impl.i.i.i.i.i = bitcast %"class.llvm::ErrorOr"* %FileOrErr to %"class.llvm::MemoryBuffer"**
+  %37 = load %"class.llvm::MemoryBuffer"** %_M_head_impl.i.i.i.i.i, align 8, !tbaa !27
+  %call9 = call %"class.llvm::Module"* @_ZN4llvm7ParseIREPNS_12MemoryBufferERNS_12SMDiagnosticERNS_11LLVMContextE(%"class.llvm::MemoryBuffer"* %37, %"class.llvm::SMDiagnostic"* dereferenceable(200) %Err, %"class.llvm::LLVMContext"* dereferenceable(8) %Context)
+  br label %cleanup
+
+cleanup:                                          ; preds = %_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE3getEv.exit, %_ZNSsD1Ev.exit62
+  %retval.0 = phi %"class.llvm::Module"* [ null, %_ZNSsD1Ev.exit62 ], [ %call9, %_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE3getEv.exit ]
+  %bf.load.i = load i8* %HasError.i24, align 8
+  %38 = and i8 %bf.load.i, 1
+  %bf.cast.i = icmp eq i8 %38, 0
+  br i1 %bf.cast.i, label %_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE10getStorageEv.exit.i, label %_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEED2Ev.exit
+
+_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE10getStorageEv.exit.i: ; preds = %cleanup
+  %_M_head_impl.i.i.i.i.i.i = bitcast %"class.llvm::ErrorOr"* %FileOrErr to %"class.llvm::MemoryBuffer"**
+  %39 = load %"class.llvm::MemoryBuffer"** %_M_head_impl.i.i.i.i.i.i, align 8, !tbaa !27
+  %cmp.i.i = icmp eq %"class.llvm::MemoryBuffer"* %39, null
+  br i1 %cmp.i.i, label %_ZNSt10unique_ptrIN4llvm12MemoryBufferESt14default_deleteIS1_EED2Ev.exit.i, label %_ZNKSt14default_deleteIN4llvm12MemoryBufferEEclEPS1_.exit.i.i
+
+_ZNKSt14default_deleteIN4llvm12MemoryBufferEEclEPS1_.exit.i.i: ; preds = %_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE10getStorageEv.exit.i
+  %40 = bitcast %"class.llvm::MemoryBuffer"* %39 to void (%"class.llvm::MemoryBuffer"*)***
+  %vtable.i.i.i = load void (%"class.llvm::MemoryBuffer"*)*** %40, align 8, !tbaa !11
+  %vfn.i.i.i = getelementptr inbounds void (%"class.llvm::MemoryBuffer"*)** %vtable.i.i.i, i64 1
+  %41 = load void (%"class.llvm::MemoryBuffer"*)** %vfn.i.i.i, align 8
+  call void %41(%"class.llvm::MemoryBuffer"* %39) #3
+  br label %_ZNSt10unique_ptrIN4llvm12MemoryBufferESt14default_deleteIS1_EED2Ev.exit.i
+
+_ZNSt10unique_ptrIN4llvm12MemoryBufferESt14default_deleteIS1_EED2Ev.exit.i: ; preds = %_ZNKSt14default_deleteIN4llvm12MemoryBufferEEclEPS1_.exit.i.i, %_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE10getStorageEv.exit.i
+  store %"class.llvm::MemoryBuffer"* null, %"class.llvm::MemoryBuffer"** %_M_head_impl.i.i.i.i.i.i, align 8, !tbaa !27
+  br label %_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEED2Ev.exit
+
+_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEED2Ev.exit: ; preds = %cleanup, %_ZNSt10unique_ptrIN4llvm12MemoryBufferESt14default_deleteIS1_EED2Ev.exit.i
+  ret %"class.llvm::Module"* %retval.0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #3
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #3
+
+; Function Attrs: noreturn nounwind
+declare void @__assert_fail(i8*, i8*, i32 zeroext, i8*) #4
+
+declare dereferenceable(8) %"class.std::basic_string"* @_ZNSs6insertEmPKcm(%"class.std::basic_string"*, i64, i8*, i64) #1
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #3
+
+; Function Attrs: nounwind
+declare void @_ZNSs4_Rep10_M_destroyERKSaIcE(%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep"*, %"class.std::allocator"* dereferenceable(1)) #0
+
+; Function Attrs: nounwind
+declare extern_weak signext i32 @__pthread_key_create(i32*, void (i8*)*) #0
+
+; Function Attrs: nobuiltin nounwind
+declare void @_ZdlPv(i8*) #6
+
+declare void @_ZNSsC1EPKcmRKSaIcE(%"class.std::basic_string"*, i8*, i64, %"class.std::allocator"* dereferenceable(1)) #1
+
+declare hidden void @_ZN4llvm12SMDiagnosticD2Ev(%"class.llvm::SMDiagnostic"* readonly %this) unnamed_addr #2 align 2
+
+declare dereferenceable(48) %"class.llvm::SmallVectorImpl.85"* @_ZN4llvm15SmallVectorImplINS_7SMFixItEEaSEOS2_(%"class.llvm::SmallVectorImpl.85"* %this, %"class.llvm::SmallVectorImpl.85"* dereferenceable(48) %RHS) #0 align 2
+
+declare %"class.llvm::Module"* @_ZN4llvm7ParseIREPNS_12MemoryBufferERNS_12SMDiagnosticERNS_11LLVMContextE(%"class.llvm::MemoryBuffer"* %Buffer, %"class.llvm::SMDiagnostic"* dereferenceable(200) %Err, %"class.llvm::LLVMContext"* dereferenceable(8) %Context) #0
+
+declare void @_ZNSs4swapERSs(%"class.std::basic_string"*, %"class.std::basic_string"* dereferenceable(8)) #1
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #3
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { inlinehint nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nobuiltin nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #7 = { noreturn nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.6.0 (trunk 215115) (llvm/trunk 215117)"}
+!1 = metadata !{metadata !2, metadata !4, i64 0}
+!2 = metadata !{metadata !"_ZTSSs", metadata !3, i64 0}
+!3 = metadata !{metadata !"_ZTSNSs12_Alloc_hiderE", metadata !4, i64 0}
+!4 = metadata !{metadata !"any pointer", metadata !5, i64 0}
+!5 = metadata !{metadata !"omnipotent char", metadata !6, i64 0}
+!6 = metadata !{metadata !"Simple C/C++ TBAA"}
+!7 = metadata !{metadata !8, metadata !9, i64 0}
+!8 = metadata !{metadata !"_ZTSNSs9_Rep_baseE", metadata !9, i64 0, metadata !9, i64 8, metadata !10, i64 16}
+!9 = metadata !{metadata !"long", metadata !5, i64 0}
+!10 = metadata !{metadata !"int", metadata !5, i64 0}
+!11 = metadata !{metadata !12, metadata !12, i64 0}
+!12 = metadata !{metadata !"vtable pointer", metadata !6, i64 0}
+!13 = metadata !{metadata !3, metadata !4, i64 0}
+!14 = metadata !{metadata !15, metadata !10, i64 24}
+!15 = metadata !{metadata !"_ZTSN4llvm12SMDiagnosticE", metadata !4, i64 0, metadata !16, i64 8, metadata !2, i64 16, metadata !10, i64 24, metadata !10, i64 28, metadata !17, i64 32, metadata !2, i64 40, metadata !2, i64 48, metadata !18, i64 56, metadata !19, i64 80}
+!16 = metadata !{metadata !"_ZTSN4llvm5SMLocE", metadata !4, i64 0}
+!17 = metadata !{metadata !"_ZTSN4llvm9SourceMgr8DiagKindE", metadata !5, i64 0}
+!18 = metadata !{metadata !"_ZTSSt6vectorISt4pairIjjESaIS1_EE"}
+!19 = metadata !{metadata !"_ZTSN4llvm11SmallVectorINS_7SMFixItELj4EEE", metadata !20, i64 48}
+!20 = metadata !{metadata !"_ZTSN4llvm18SmallVectorStorageINS_7SMFixItELj4EEE", metadata !5, i64 0}
+!21 = metadata !{metadata !15, metadata !10, i64 28}
+!22 = metadata !{metadata !15, metadata !17, i64 32}
+!23 = metadata !{metadata !24, metadata !4, i64 0}
+!24 = metadata !{metadata !"_ZTSN4llvm15SmallVectorBaseE", metadata !4, i64 0, metadata !4, i64 8, metadata !4, i64 16}
+!25 = metadata !{metadata !24, metadata !4, i64 8}
+!26 = metadata !{metadata !24, metadata !4, i64 16}
+!27 = metadata !{metadata !4, metadata !4, i64 0}
+!28 = metadata !{metadata !"branch_weights", i32 64, i32 4}
+!29 = metadata !{metadata !10, metadata !10, i64 0}
+!30 = metadata !{metadata !31, metadata !4, i64 8}
+!31 = metadata !{metadata !"_ZTSN4llvm12MemoryBufferE", metadata !4, i64 8, metadata !4, i64 16}
+!32 = metadata !{metadata !31, metadata !4, i64 16}
+!33 = metadata !{metadata !5, metadata !5, i64 0}
+!34 = metadata !{metadata !35, metadata !4, i64 0}
+!35 = metadata !{metadata !"_ZTSSt12_Vector_baseISt4pairIjjESaIS1_EE", metadata !36, i64 0}
+!36 = metadata !{metadata !"_ZTSNSt12_Vector_baseISt4pairIjjESaIS1_EE12_Vector_implE", metadata !4, i64 0, metadata !4, i64 8, metadata !4, i64 16}
+!37 = metadata !{metadata !38, metadata !38, i64 0}
+!38 = metadata !{metadata !"bool", metadata !5, i64 0}
+!39 = metadata !{i8 0, i8 2}
+!40 = metadata !{metadata !41, metadata !4, i64 0}
+!41 = metadata !{metadata !"_ZTSN4llvm10TimeRegionE", metadata !4, i64 0}
+!42 = metadata !{metadata !43, metadata !44, i64 32}
+!43 = metadata !{metadata !"_ZTSN4llvm11raw_ostreamE", metadata !4, i64 8, metadata !4, i64 16, metadata !4, i64 24, metadata !44, i64 32}
+!44 = metadata !{metadata !"_ZTSN4llvm11raw_ostream10BufferKindE", metadata !5, i64 0}
+!45 = metadata !{metadata !43, metadata !4, i64 24}
+!46 = metadata !{metadata !43, metadata !4, i64 8}
+!47 = metadata !{i64 0, i64 8, metadata !27, i64 8, i64 8, metadata !27}
diff --git a/test/CodeGen/PowerPC/unal-altivec-wint.ll b/test/CodeGen/PowerPC/unal-altivec-wint.ll
new file mode 100644
index 0000000..7e0963f
--- /dev/null
+++ b/test/CodeGen/PowerPC/unal-altivec-wint.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare <4 x i32> @llvm.ppc.altivec.lvx(i8*) #1
+
+define <4 x i32> @test1(<4 x i32>* %h) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  %vl = call <4 x i32> @llvm.ppc.altivec.lvx(i8* %hv)
+
+  %v0 = load <4 x i32>* %h, align 8
+
+  %a = add <4 x i32> %v0, %vl
+  ret <4 x i32> %a
+
+; CHECK-LABEL: @test1
+; CHECK: li [[REG:[0-9]+]], 16
+; CHECK-NOT: li {{[0-9]+}}, 15
+; CHECK-DAG: lvx {{[0-9]+}}, 0, 3
+; CHECK-DAG: lvx {{[0-9]+}}, 3, [[REG]]
+; CHECK: blr
+}
+
+declare void @llvm.ppc.altivec.stvx(<4 x i32>, i8*) #0
+
+define <4 x i32> @test2(<4 x i32>* %h, <4 x i32> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  call void @llvm.ppc.altivec.stvx(<4 x i32> %d, i8* %hv)
+
+  %v0 = load <4 x i32>* %h, align 8
+
+  ret <4 x i32> %v0
+
+; CHECK-LABEL: @test2
+; CHECK: li [[REG:[0-9]+]], 16
+; CHECK-NOT: li {{[0-9]+}}, 15
+; CHECK-DAG: lvx {{[0-9]+}}, 0, 3
+; CHECK-DAG: lvx {{[0-9]+}}, 3, [[REG]]
+; CHECK: blr
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+
diff --git a/test/CodeGen/PowerPC/unal4-std.ll b/test/CodeGen/PowerPC/unal4-std.ll
index 9f29e31..e911099 100644
--- a/test/CodeGen/PowerPC/unal4-std.ll
+++ b/test/CodeGen/PowerPC/unal4-std.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mcpu=pwr7 | FileCheck %s
+; RUN: llc < %s -mcpu=pwr7 -mattr=-vsx| FileCheck %s
+; RUN: llc < %s -mcpu=pwr7 -mattr=+vsx | FileCheck -check-prefix=CHECK-VSX %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -22,6 +23,9 @@ if.end210:                                        ; preds = %entry
 ; a multiple of 4).
 ; CHECK: @copy_to_conceal
 ; CHECK: stdx {{[0-9]+}}, 0,
+
+; CHECK-VSX: @copy_to_conceal
+; CHECK-VSX: stxvw4x {{[0-9]+}}, 0,
 }
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/unaligned.ll b/test/CodeGen/PowerPC/unaligned.ll
index d469c62..64c03cd 100644
--- a/test/CodeGen/PowerPC/unaligned.ll
+++ b/test/CodeGen/PowerPC/unaligned.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck %s
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128-n32"
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck -check-prefix=CHECK-VSX %s
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128-n32"
 
 define void @foo1(i16* %p, i16* %r) nounwind {
@@ -10,6 +12,10 @@ entry:
 ; CHECK: @foo1
 ; CHECK: lhz
 ; CHECK: sth
+
+; CHECK-VSX: @foo1
+; CHECK-VSX: lhz
+; CHECK-VSX: sth
 }
 
 define void @foo2(i32* %p, i32* %r) nounwind {
@@ -21,6 +27,10 @@ entry:
 ; CHECK: @foo2
 ; CHECK: lwz
 ; CHECK: stw
+
+; CHECK-VSX: @foo2
+; CHECK-VSX: lwz
+; CHECK-VSX: stw
 }
 
 define void @foo3(i64* %p, i64* %r) nounwind {
@@ -32,6 +42,10 @@ entry:
 ; CHECK: @foo3
 ; CHECK: ld
 ; CHECK: std
+
+; CHECK-VSX: @foo3
+; CHECK-VSX: ld
+; CHECK-VSX: std
 }
 
 define void @foo4(float* %p, float* %r) nounwind {
@@ -43,6 +57,10 @@ entry:
 ; CHECK: @foo4
 ; CHECK: lfs
 ; CHECK: stfs
+
+; CHECK-VSX: @foo4
+; CHECK-VSX: lfs
+; CHECK-VSX: stfs
 }
 
 define void @foo5(double* %p, double* %r) nounwind {
@@ -54,6 +72,10 @@ entry:
 ; CHECK: @foo5
 ; CHECK: lfd
 ; CHECK: stfd
+
+; CHECK-VSX: @foo5
+; CHECK-VSX: lxsdx
+; CHECK-VSX: stxsdx
 }
 
 define void @foo6(<4 x float>* %p, <4 x float>* %r) nounwind {
@@ -69,5 +91,15 @@ entry:
 ; CHECK-DAG: ld
 ; CHECK-DAG: stdx
 ; CHECK: stdx
+
+; For VSX on P7, unaligned loads and stores are preferable to aligned
+; stack slots, but lvsl/vperm is better still.  (On P8 lxvw4x is preferable.)
+; Using unaligned stxvw4x is preferable on both machines.
+; CHECK-VSX: @foo6
+; CHECK-VSX-DAG: lvsl
+; CHECK-VSX-DAG: lvx
+; CHECK-VSX-DAG: lvx
+; CHECK-VSX: vperm
+; CHECK-VSX: stxvw4x
 }
 
diff --git a/test/CodeGen/PowerPC/unsafe-math.ll b/test/CodeGen/PowerPC/unsafe-math.ll
index b0bdcc2..f643027 100644
--- a/test/CodeGen/PowerPC/unsafe-math.ll
+++ b/test/CodeGen/PowerPC/unsafe-math.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=ppc32 | grep fmul | count 2
-; RUN: llc < %s -march=ppc32 -enable-unsafe-fp-math | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 | grep fmul | count 2
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -enable-unsafe-fp-math | \
 ; RUN:   grep fmul | count 1
 
 define double @foo(double %X) nounwind {
diff --git a/test/CodeGen/PowerPC/unwind-dw2-g.ll b/test/CodeGen/PowerPC/unwind-dw2-g.ll
index 24b5207..54d3189 100644
--- a/test/CodeGen/PowerPC/unwind-dw2-g.ll
+++ b/test/CodeGen/PowerPC/unwind-dw2-g.ll
@@ -21,15 +21,15 @@ attributes #0 = { nounwind }
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8, !11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/unwind-dw2.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/unwind-dw2.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"/tmp/unwind-dw2.c", metadata !"/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/unwind-dw2.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\000\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/unwind-dw2.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
 !8 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
 !9 = metadata !{i32 2, i32 0, metadata !4, null}
 !10 = metadata !{i32 3, i32 0, metadata !4, null}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/PowerPC/varargs-struct-float.ll b/test/CodeGen/PowerPC/varargs-struct-float.ll
index fb1835f..0fd9fc5 100644
--- a/test/CodeGen/PowerPC/varargs-struct-float.ll
+++ b/test/CodeGen/PowerPC/varargs-struct-float.ll
@@ -16,8 +16,8 @@ entry:
   ret void
 }
 
-; CHECK: stfs {{[0-9]+}}, 60(1)
-; CHECK: ld 4, 56(1)
+; CHECK: stfs {{[0-9]+}}, 116(1)
+; CHECK: lwz 4, 116(1)
 ; CHECK: bl
 
 declare void @testvaSf1(i32, ...)
diff --git a/test/CodeGen/PowerPC/vec-abi-align.ll b/test/CodeGen/PowerPC/vec-abi-align.ll
index 3239cf6..5075ff2 100644
--- a/test/CodeGen/PowerPC/vec-abi-align.ll
+++ b/test/CodeGen/PowerPC/vec-abi-align.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx < %s | FileCheck -check-prefix=CHECK-VSX %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -16,6 +17,10 @@ entry:
 ; CHECK-LABEL: @test1
 ; CHECK: stvx 2,
 ; CHECK: blr
+
+; CHECK-VSX-LABEL: @test1
+; CHECK-VSX: stxvw4x 34,
+; CHECK-VSX: blr
 }
 
 ; Function Attrs: nounwind
@@ -35,6 +40,13 @@ entry:
 ; CHECK: addi [[REGB:[0-9]+]], 1, 112
 ; CHECK: lvx 2, [[REGB]], [[REG16]]
 ; CHECK: blr
+
+; CHECK-VSX-LABEL: @test2
+; CHECK-VSX: ld {{[0-9]+}}, 112(1)
+; CHECK-VSX: li [[REG16:[0-9]+]], 16
+; CHECK-VSX: addi [[REGB:[0-9]+]], 1, 112
+; CHECK-VSX: lxvw4x {{[0-9]+}}, [[REGB]], [[REG16]]
+; CHECK-VSX: blr
 }
 
 ; Function Attrs: nounwind
@@ -54,6 +66,13 @@ entry:
 ; CHECK: addi [[REGB:[0-9]+]], 1, 128
 ; CHECK: lvx 2, [[REGB]], [[REG16]]
 ; CHECK: blr
+
+; CHECK-VSX-LABEL: @test3
+; CHECK-VSX: ld {{[0-9]+}}, 128(1)
+; CHECK-VSX: li [[REG16:[0-9]+]], 16
+; CHECK-VSX: addi [[REGB:[0-9]+]], 1, 128
+; CHECK-VSX: lxvw4x {{[0-9]+}}, [[REGB]], [[REG16]]
+; CHECK-VSX: blr
 }
 
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/PowerPC/vec_cmp.ll b/test/CodeGen/PowerPC/vec_cmp.ll
index 2733089..516b2dd 100644
--- a/test/CodeGen/PowerPC/vec_cmp.ll
+++ b/test/CodeGen/PowerPC/vec_cmp.ll
@@ -63,9 +63,8 @@ entry:
   ret <16 x i8> %sext
 }
 ; CHECK-LABEL:      v16si8_cmp_le:
-; CHECK:      vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtsb [[RCMPLE:[0-9]+]], 3, 2
-; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+; CHECK:      vcmpgtsb [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <16 x i8> @v16ui8_cmp_le(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
 entry:
@@ -74,9 +73,8 @@ entry:
   ret <16 x i8> %sext
 }
 ; CHECK-LABEL:      v16ui8_cmp_le:
-; CHECK:      vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtub [[RCMPLE:[0-9]+]], 3, 2
-; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+; CHECK:      vcmpgtub [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <16 x i8> @v16si8_cmp_lt(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
 entry:
@@ -121,9 +119,8 @@ entry:
   ret <16 x i8> %sext
 }
 ; CHECK-LABEL:      v16si8_cmp_ge:
-; CHECK:      vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtsb [[RCMPGT:[0-9]+]], 2, 3
-; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+; CHECK:      vcmpgtsb [[RET:[0-9]+]], 3, 2
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <16 x i8> @v16ui8_cmp_ge(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
 entry:
@@ -132,9 +129,8 @@ entry:
   ret <16 x i8> %sext
 }
 ; CHECK-LABEL:      v16ui8_cmp_ge:
-; CHECK:      vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtub [[RCMPGT:[0-9]+]], 2, 3
-; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+; CHECK:      vcmpgtub [[RET:[0-9]+]], 3, 2
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 
 define <32 x i8> @v32si8_cmp(<32 x i8> %x, <32 x i8> %y) nounwind readnone {
@@ -193,9 +189,8 @@ entry:
   ret <8 x i16> %sext
 }
 ; CHECK-LABEL:      v8si16_cmp_le:
-; CHECK:      vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtsh [[RCMPLE:[0-9]+]], 3, 2
-; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+; CHECK:      vcmpgtsh [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <8 x i16> @v8ui16_cmp_le(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
 entry:
@@ -204,9 +199,8 @@ entry:
   ret <8 x i16> %sext
 }
 ; CHECK-LABEL:      v8ui16_cmp_le:
-; CHECK:      vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtuh [[RCMPLE:[0-9]+]], 3, 2
-; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+; CHECK:      vcmpgtuh [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <8 x i16> @v8si16_cmp_lt(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
 entry:
@@ -251,9 +245,8 @@ entry:
   ret <8 x i16> %sext
 }
 ; CHECK-LABEL:      v8si16_cmp_ge:
-; CHECK:      vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtsh [[RCMPGT:[0-9]+]], 2, 3
-; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+; CHECK:      vcmpgtsh [[RET:[0-9]+]], 3, 2
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <8 x i16> @v8ui16_cmp_ge(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
 entry:
@@ -262,9 +255,8 @@ entry:
   ret <8 x i16> %sext
 }
 ; CHECK-LABEL:      v8ui16_cmp_ge:
-; CHECK:      vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtuh [[RCMPGT:[0-9]+]], 2, 3
-; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+; CHECK:      vcmpgtuh [[RET:[0-9]+]], 3, 2
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 
 define <16 x i16> @v16si16_cmp(<16 x i16> %x, <16 x i16> %y) nounwind readnone {
@@ -326,9 +318,8 @@ entry:
   ret <4 x i32> %sext
 }
 ; CHECK-LABEL:      v4si32_cmp_le:
-; CHECK:      vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtsw [[RCMPLE:[0-9]+]], 3, 2
-; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+; CHECK:      vcmpgtsw [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <4 x i32> @v4ui32_cmp_le(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
 entry:
@@ -337,9 +328,8 @@ entry:
   ret <4 x i32> %sext
 }
 ; CHECK-LABEL:      v4ui32_cmp_le:
-; CHECK:      vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtuw [[RCMPLE:[0-9]+]], 3, 2
-; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+; CHECK:      vcmpgtuw [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <4 x i32> @v4si32_cmp_lt(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
 entry:
@@ -384,9 +374,8 @@ entry:
   ret <4 x i32> %sext
 }
 ; CHECK-LABEL:      v4si32_cmp_ge:
-; CHECK:      vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtsw [[RCMPGT:[0-9]+]], 2, 3
-; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+; CHECK:      vcmpgtsw [[RET:[0-9]+]], 3, 2
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <4 x i32> @v4ui32_cmp_ge(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
 entry:
@@ -395,9 +384,8 @@ entry:
   ret <4 x i32> %sext
 }
 ; CHECK-LABEL:      v4ui32_cmp_ge:
-; CHECK:      vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtuw [[RCMPGT:[0-9]+]], 2, 3
-; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+; CHECK:      vcmpgtuw [[RET:[0-9]+]], 3, 2
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 
 define <8 x i32> @v8si32_cmp(<8 x i32> %x, <8 x i32> %y) nounwind readnone {
@@ -480,9 +468,7 @@ entry:
   ret <4 x float> %0
 }
 ; CHECK-LABEL:      v4f32_cmp_le:
-; CHECK:      vcmpeqfp [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtfp [[RCMPLE:[0-9]+]], 3, 2
-; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+; CHECK: vcmpgefp 2, 3, 2
 
 define <4 x float> @v4f32_cmp_lt(<4 x float> %x, <4 x float> %y) nounwind readnone {
 entry:
@@ -514,6 +500,50 @@ entry:
 ; CHECK-LABEL: v4f32_cmp_gt:
 ; CHECK: vcmpgtfp 2, 2, 3
 
+define <4 x float> @v4f32_cmp_ule(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp ule <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK-LABEL: v4f32_cmp_ule:
+; CHECK:      vcmpgtfp [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+
+define <4 x float> @v4f32_cmp_ult(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp ult <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK-LABEL: v4f32_cmp_ult:
+; CHECK:      vcmpgefp [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+
+define <4 x float> @v4f32_cmp_uge(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp uge <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK-LABEL: v4f32_cmp_uge:
+; CHECK:      vcmpgtfp [[RET:[0-9]+]], 3, 2
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+
+define <4 x float> @v4f32_cmp_ugt(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp ugt <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK-LABEL: v4f32_cmp_ugt:
+; CHECK:      vcmpgefp [[RET:[0-9]+]], 3, 2
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+
 
 define <8 x float> @v8f32_cmp(<8 x float> %x, <8 x float> %y) nounwind readnone {
 entry:
diff --git a/test/CodeGen/PowerPC/vec_misaligned.ll b/test/CodeGen/PowerPC/vec_misaligned.ll
index 304a84d..73a4a4d 100644
--- a/test/CodeGen/PowerPC/vec_misaligned.ll
+++ b/test/CodeGen/PowerPC/vec_misaligned.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=ppc32 -mcpu=g5 | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -mattr=-vsx -mattr=-power8-vector | FileCheck %s
 ; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s -check-prefix=CHECK-LE
 
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
diff --git a/test/CodeGen/PowerPC/vec_mul.ll b/test/CodeGen/PowerPC/vec_mul.ll
index 8a44815..86596d4 100644
--- a/test/CodeGen/PowerPC/vec_mul.ll
+++ b/test/CodeGen/PowerPC/vec_mul.ll
@@ -1,6 +1,8 @@
-; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -march=ppc32 -mattr=+altivec | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -march=ppc64 -mattr=+altivec | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -march=ppc64 -mattr=+altivec | FileCheck %s -check-prefix=CHECK-LE
+; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -march=ppc32 -mattr=+altivec -mattr=-vsx | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -march=ppc64 -mattr=+altivec -mattr=-vsx -mcpu=pwr7 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -march=ppc64 -mattr=+altivec -mattr=-vsx -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-LE
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -march=ppc64 -mattr=+altivec -mattr=+vsx -mcpu=pwr7 | FileCheck %s -check-prefix=CHECK-VSX
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -march=ppc64 -mattr=+altivec -mattr=+vsx -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-LE-VSX
 
 define <4 x i32> @test_v4i32(<4 x i32>* %X, <4 x i32>* %Y) {
 	%tmp = load <4 x i32>* %X		; <<4 x i32>> [#uses=1]
@@ -14,6 +16,12 @@ define <4 x i32> @test_v4i32(<4 x i32>* %X, <4 x i32>* %Y) {
 ; CHECK-LE-LABEL: test_v4i32:
 ; CHECK-LE: vmsumuhm
 ; CHECK-LE-NOT: mullw
+; CHECK-VSX-LABEL: test_v4i32:
+; CHECK-VSX: vmsumuhm
+; CHECK-VSX-NOT: mullw
+; CHECK-LE-VSX-LABEL: test_v4i32:
+; CHECK-LE-VSX: vmsumuhm
+; CHECK-LE-VSX-NOT: mullw
 
 define <8 x i16> @test_v8i16(<8 x i16>* %X, <8 x i16>* %Y) {
 	%tmp = load <8 x i16>* %X		; <<8 x i16>> [#uses=1]
@@ -27,6 +35,12 @@ define <8 x i16> @test_v8i16(<8 x i16>* %X, <8 x i16>* %Y) {
 ; CHECK-LE-LABEL: test_v8i16:
 ; CHECK-LE: vmladduhm
 ; CHECK-LE-NOT: mullw
+; CHECK-VSX-LABEL: test_v8i16:
+; CHECK-VSX: vmladduhm
+; CHECK-VSX-NOT: mullw
+; CHECK-LE-VSX-LABEL: test_v8i16:
+; CHECK-LE-VSX: vmladduhm
+; CHECK-LE-VSX-NOT: mullw
 
 define <16 x i8> @test_v16i8(<16 x i8>* %X, <16 x i8>* %Y) {
 	%tmp = load <16 x i8>* %X		; <<16 x i8>> [#uses=1]
@@ -43,6 +57,15 @@ define <16 x i8> @test_v16i8(<16 x i8>* %X, <16 x i8>* %Y) {
 ; CHECK-LE: vmuleub [[REG2:[0-9]+]]
 ; CHECK-LE: vperm {{[0-9]+}}, [[REG2]], [[REG1]]
 ; CHECK-LE-NOT: mullw
+; CHECK-VSX-LABEL: test_v16i8:
+; CHECK-VSX: vmuloub
+; CHECK-VSX: vmuleub
+; CHECK-VSX-NOT: mullw
+; CHECK-LE-VSX-LABEL: test_v16i8:
+; CHECK-LE-VSX: vmuloub [[REG1:[0-9]+]]
+; CHECK-LE-VSX: vmuleub [[REG2:[0-9]+]]
+; CHECK-LE-VSX: vperm {{[0-9]+}}, [[REG2]], [[REG1]]
+; CHECK-LE-VSX-NOT: mullw
 
 define <4 x float> @test_float(<4 x float>* %X, <4 x float>* %Y) {
 	%tmp = load <4 x float>* %X
@@ -61,3 +84,7 @@ define <4 x float> @test_float(<4 x float>* %X, <4 x float>* %Y) {
 ; CHECK-LE: vspltisw [[ZNEG:[0-9]+]], -1
 ; CHECK-LE: vslw     {{[0-9]+}}, [[ZNEG]], [[ZNEG]]
 ; CHECK-LE: vmaddfp
+; CHECK-VSX-LABEL: test_float:
+; CHECK-VSX: xvmulsp
+; CHECK-LE-VSX-LABEL: test_float:
+; CHECK-LE-VSX: xvmulsp
diff --git a/test/CodeGen/PowerPC/vec_shuffle_le.ll b/test/CodeGen/PowerPC/vec_shuffle_le.ll
index 635721c..a4b2119 100644
--- a/test/CodeGen/PowerPC/vec_shuffle_le.ll
+++ b/test/CodeGen/PowerPC/vec_shuffle_le.ll
@@ -6,7 +6,9 @@ entry:
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
         %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK: vpkuhum
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vpkuhum [[REG3:[0-9]+]], [[REG2]], [[REG1]]
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -27,7 +29,9 @@ entry:
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
         %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13, i32 16, i32 17, i32 20, i32 21, i32 24, i32 25, i32 28, i32 29>
-; CHECK: vpkuwum
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vpkuwum [[REG3:[0-9]+]], [[REG2]], [[REG1]]
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -48,7 +52,9 @@ entry:
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
         %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-; CHECK: vmrglb
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vmrglb [[REG3:[0-9]+]], [[REG2]], [[REG1]]
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -69,7 +75,9 @@ entry:
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
         %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-; CHECK: vmrghb
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vmrghb [[REG3:[0-9]+]], [[REG2]], [[REG1]]
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -90,7 +98,9 @@ entry:
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
         %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 2, i32 3, i32 18, i32 19, i32 4, i32 5, i32 20, i32 21, i32 6, i32 7, i32 22, i32 23>
-; CHECK: vmrglh
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vmrglh [[REG3:[0-9]+]], [[REG2]], [[REG1]]
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -111,7 +121,9 @@ entry:
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
         %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 9, i32 24, i32 25, i32 10, i32 11, i32 26, i32 27, i32 12, i32 13, i32 28, i32 29, i32 14, i32 15, i32 30, i32 31>
-; CHECK: vmrghh
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vmrghh [[REG3:[0-9]+]], [[REG2]], [[REG1]]
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -132,7 +144,9 @@ entry:
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
         %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23>
-; CHECK: vmrglw
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vmrglw [[REG3:[0-9]+]], [[REG2]], [[REG1]]
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -153,7 +167,9 @@ entry:
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
         %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
-; CHECK: vmrghw
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vmrghw [[REG3:[0-9]+]], [[REG2]], [[REG1]]
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -173,8 +189,10 @@ entry:
 ; CHECK: VSLDOI_xy:
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
-        %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
-; CHECK: vsldoi
+        %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vsldoi [[REG3:[0-9]+]], [[REG2]], [[REG1]], 4
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -183,7 +201,7 @@ define void @VSLDOI_xx(<16 x i8>* %A) {
 entry:
 ; CHECK: VSLDOI_xx:
         %tmp = load <16 x i8>* %A
-        %tmp2 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+        %tmp2 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK: vsldoi
         store <16 x i8> %tmp2, <16 x i8>* %A
         ret void
diff --git a/test/CodeGen/PowerPC/vec_urem_const.ll b/test/CodeGen/PowerPC/vec_urem_const.ll
new file mode 100644
index 0000000..814a826
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_urem_const.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mcpu=pwr6 -mattr=+altivec < %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Common code used to replace the urem by a mulhu, and compilation would
+; then crash since mulhu isn't supported on vector types.
+
+define <4 x i32> @test(<4 x i32> %x) {
+entry:
+  %0 = urem <4 x i32> %x, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+  ret <4 x i32> %0
+}
diff --git a/test/CodeGen/PowerPC/vrspill.ll b/test/CodeGen/PowerPC/vrspill.ll
index c3d1bf8..b55e129 100644
--- a/test/CodeGen/PowerPC/vrspill.ll
+++ b/test/CodeGen/PowerPC/vrspill.ll
@@ -1,5 +1,6 @@
-; RUN: llc -O0 -mtriple=powerpc-unknown-linux-gnu -mattr=+altivec -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -verify-machineinstrs -fast-isel=false < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=powerpc-unknown-linux-gnu -mattr=+altivec -mattr=-vsx -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -mattr=-vsx -verify-machineinstrs -fast-isel=false -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -mattr=+vsx -verify-machineinstrs -fast-isel=false -mcpu=pwr7 < %s | FileCheck -check-prefix=CHECK-VSX %s
 
 ; This verifies that we generate correct spill/reload code for vector regs.
 
@@ -15,4 +16,9 @@ entry:
 
 ; CHECK: stvx 2,
 
+; We would prefer to test for "stxvw4x 34," but current -O0 code
+; needlessly generates "vor 3,2,2 / stxvw4x 35,0,3", so we'll settle for
+; the opcode.
+; CHECK-VSX: stxvw4x
+
 declare void @foo(i32*)
diff --git a/test/CodeGen/PowerPC/vsx-div.ll b/test/CodeGen/PowerPC/vsx-div.ll
new file mode 100644
index 0000000..8a9578e
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-div.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mcpu=pwr7 -mattr=+vsx -O1 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+
+@vf = global <4 x float> <float -1.500000e+00, float 2.500000e+00, float -3.500000e+00, float 4.500000e+00>, align 16
+@vd = global <2 x double> <double 3.500000e+00, double -7.500000e+00>, align 16
+@vf_res = common global <4 x float> zeroinitializer, align 16
+@vd_res = common global <2 x double> zeroinitializer, align 16
+
+define void @test1() {
+entry:
+  %0 = load <4 x float>* @vf, align 16
+  %1 = tail call <4 x float> @llvm.ppc.vsx.xvdivsp(<4 x float> %0, <4 x float> %0)
+  store <4 x float> %1, <4 x float>* @vf_res, align 16
+  ret void
+}
+; CHECK-LABEL: @test1
+; CHECK: xvdivsp
+
+define void @test2() {
+entry:
+  %0 = load <2 x double>* @vd, align 16
+  %1 = tail call <2 x double> @llvm.ppc.vsx.xvdivdp(<2 x double> %0, <2 x double> %0)
+  store <2 x double> %1, <2 x double>* @vd_res, align 16
+  ret void
+}
+; CHECK-LABEL: @test2
+; CHECK: xvdivdp
+
+declare <2 x double> @llvm.ppc.vsx.xvdivdp(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.ppc.vsx.xvdivsp(<4 x float>, <4 x float>)
diff --git a/test/CodeGen/PowerPC/vsx-fma-m.ll b/test/CodeGen/PowerPC/vsx-fma-m.ll
index da4a204..9dff9a7 100644
--- a/test/CodeGen/PowerPC/vsx-fma-m.ll
+++ b/test/CodeGen/PowerPC/vsx-fma-m.ll
@@ -177,21 +177,27 @@ entry:
   store <2 x double> %1, <2 x double>* %arrayidx3, align 8
   ret void
 
+; Note: There is some unavoidable changeability in this variant.  If the
+; FMAs are reordered differently, the algorithm can pick a different
+; multiplicand to destroy, changing the register assignment.  There isn't
+; a good way to express this possibility, so hopefully this doesn't change
+; too often.
+
 ; CHECK-LABEL: @testv3
 ; CHECK-DAG: xxlor [[V1:[0-9]+]], 34, 34
-; CHECK-DAG: xvmaddmdp 37, 35, 34
 ; CHECK-DAG: li [[C1:[0-9]+]], 48
 ; CHECK-DAG: li [[C2:[0-9]+]], 32
-; CHECK-DAG: xvmaddadp 34, 35, 38
+; CHECK-DAG: xvmaddmdp 37, 35, 34
 ; CHECK-DAG: li [[C3:[0-9]+]], 16
 
 ; Note: We could convert this next FMA to M-type as well, but it would require
 ; re-ordering the instructions.
 ; CHECK-DAG: xvmaddadp [[V1]], 35, 36
 
-; CHECK-DAG: xvmaddmdp 35, 36, 37
+; CHECK-DAG: xvmaddmdp 36, 35, 37
+; CHECK-DAG: xvmaddadp 34, 35, 38
 ; CHECK-DAG: stxvd2x 32, 0, 3
-; CHECK-DAG: stxvd2x 35, 3, [[C1]]
+; CHECK-DAG: stxvd2x 36, 3, [[C1]]
 ; CHECK-DAG: stxvd2x 34, 3, [[C2]]
 ; CHECK-DAG: stxvd2x 37, 3, [[C3]]
 ; CHECK: blr
diff --git a/test/CodeGen/PowerPC/vsx-ldst.ll b/test/CodeGen/PowerPC/vsx-ldst.ll
new file mode 100644
index 0000000..0c9ebef
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-ldst.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64-unknown-linux-gnu < %s > %t
+; RUN: grep lxvw4x < %t | count 3
+; RUN: grep lxvd2x < %t | count 3
+; RUN: grep stxvw4x < %t | count 3
+; RUN: grep stxvd2x < %t | count 3
+
+@vsi = global <4 x i32> <i32 -1, i32 2, i32 -3, i32 4>, align 16
+@vui = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@vf = global <4 x float> <float -1.500000e+00, float 2.500000e+00, float -3.500000e+00, float 4.500000e+00>, align 16
+@vsll = global <2 x i64> <i64 255, i64 -937>, align 16
+@vull = global <2 x i64> <i64 1447, i64 2894>, align 16
+@vd = global <2 x double> <double 3.500000e+00, double -7.500000e+00>, align 16
+@res_vsi = common global <4 x i32> zeroinitializer, align 16
+@res_vui = common global <4 x i32> zeroinitializer, align 16
+@res_vf = common global <4 x float> zeroinitializer, align 16
+@res_vsll = common global <2 x i64> zeroinitializer, align 16
+@res_vull = common global <2 x i64> zeroinitializer, align 16
+@res_vd = common global <2 x double> zeroinitializer, align 16
+
+; Function Attrs: nounwind
+define void @test1() {
+entry:
+  %0 = load <4 x i32>* @vsi, align 16
+  %1 = load <4 x i32>* @vui, align 16
+  %2 = load <4 x i32>* bitcast (<4 x float>* @vf to <4 x i32>*), align 16
+  %3 = load <2 x double>* bitcast (<2 x i64>* @vsll to <2 x double>*), align 16
+  %4 = load <2 x double>* bitcast (<2 x i64>* @vull to <2 x double>*), align 16
+  %5 = load <2 x double>* @vd, align 16
+  store <4 x i32> %0, <4 x i32>* @res_vsi, align 16
+  store <4 x i32> %1, <4 x i32>* @res_vui, align 16
+  store <4 x i32> %2, <4 x i32>* bitcast (<4 x float>* @res_vf to <4 x i32>*), align 16
+  store <2 x double> %3, <2 x double>* bitcast (<2 x i64>* @res_vsll to <2 x double>*), align 16
+  store <2 x double> %4, <2 x double>* bitcast (<2 x i64>* @res_vull to <2 x double>*), align 16
+  store <2 x double> %5, <2 x double>* @res_vd, align 16
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/vsx-minmax.ll b/test/CodeGen/PowerPC/vsx-minmax.ll
new file mode 100644
index 0000000..47f50ab
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-minmax.ll
@@ -0,0 +1,98 @@
+; RUN: llc -mcpu=pwr7 -mattr=+vsx -O0 -fast-isel=0 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@vf = global <4 x float> <float -1.500000e+00, float 2.500000e+00, float -3.500000e+00, float 4.500000e+00>, align 16
+@vd = global <2 x double> <double 3.500000e+00, double -7.500000e+00>, align 16
+@d = global double 2.340000e+01, align 8
+@vf1 = common global <4 x float> zeroinitializer, align 16
+@vd1 = common global <2 x double> zeroinitializer, align 16
+@vf2 = common global <4 x float> zeroinitializer, align 16
+@vf3 = common global <4 x float> zeroinitializer, align 16
+@vd2 = common global <2 x double> zeroinitializer, align 16
+@vf4 = common global <4 x float> zeroinitializer, align 16
+@d1 = common global double 0.000000e+00, align 8
+@d2 = common global double 0.000000e+00, align 8
+
+; Function Attrs: nounwind
+define void @test1() #0 {
+; CHECK-LABEL: @test1
+entry:
+  %0 = load volatile <4 x float>* @vf, align 16
+  %1 = load volatile <4 x float>* @vf, align 16
+  %2 = tail call <4 x float> @llvm.ppc.vsx.xvmaxsp(<4 x float> %0, <4 x float> %1)
+; CHECK: xvmaxsp
+  store <4 x float> %2, <4 x float>* @vf1, align 16
+  %3 = load <2 x double>* @vd, align 16
+  %4 = tail call <2 x double> @llvm.ppc.vsx.xvmaxdp(<2 x double> %3, <2 x double> %3)
+; CHECK: xvmaxdp
+  store <2 x double> %4, <2 x double>* @vd1, align 16
+  %5 = load volatile <4 x float>* @vf, align 16
+  %6 = load volatile <4 x float>* @vf, align 16
+  %7 = tail call <4 x float> @llvm.ppc.vsx.xvmaxsp(<4 x float> %5, <4 x float> %6)
+; CHECK: xvmaxsp
+  store <4 x float> %7, <4 x float>* @vf2, align 16
+  %8 = load volatile <4 x float>* @vf, align 16
+  %9 = load volatile <4 x float>* @vf, align 16
+  %10 = tail call <4 x float> @llvm.ppc.vsx.xvminsp(<4 x float> %8, <4 x float> %9)
+; CHECK: xvminsp
+  store <4 x float> %10, <4 x float>* @vf3, align 16
+  %11 = load <2 x double>* @vd, align 16
+  %12 = tail call <2 x double> @llvm.ppc.vsx.xvmindp(<2 x double> %11, <2 x double> %11)
+; CHECK: xvmindp
+  store <2 x double> %12, <2 x double>* @vd2, align 16
+  %13 = load volatile <4 x float>* @vf, align 16
+  %14 = load volatile <4 x float>* @vf, align 16
+  %15 = tail call <4 x float> @llvm.ppc.vsx.xvminsp(<4 x float> %13, <4 x float> %14)
+; CHECK: xvminsp
+  store <4 x float> %15, <4 x float>* @vf4, align 16
+  %16 = load double* @d, align 8
+  %17 = tail call double @llvm.ppc.vsx.xsmaxdp(double %16, double %16)
+; CHECK: xsmaxdp
+  store double %17, double* @d1, align 8
+  %18 = tail call double @llvm.ppc.vsx.xsmindp(double %16, double %16)
+; CHECK: xsmindp
+  store double %18, double* @d2, align 8
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare double @llvm.ppc.vsx.xsmaxdp(double, double)
+
+; Function Attrs: nounwind readnone
+declare double @llvm.ppc.vsx.xsmindp(double, double)
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.ppc.vsx.xvminsp(<4 x float>, <4 x float>)
+
+; Function Attrs: nounwind readnone
+declare <2 x double> @llvm.ppc.vsx.xvmindp(<2 x double>, <2 x double>)
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.ppc.vsx.xvmaxsp(<4 x float>, <4 x float>)
+
+; Function Attrs: nounwind readnone
+declare <2 x double> @llvm.ppc.vsx.xvmaxdp(<2 x double>, <2 x double>)
+
+; Generated from C source:
+
+; % clang -O1 -maltivec -mvsx -S -emit-llvm vsx-minmax.c
+;
+;volatile vector float vf = { -1.5, 2.5, -3.5, 4.5 };
+;vector double vd = { 3.5, -7.5 };
+;double d = 23.4;
+;
+;vector float vf1, vf2, vf3, vf4;
+;vector double vd1, vd2;
+;double d1, d2;
+;
+;void test1() {
+;  vf1 = vec_max(vf, vf);
+;  vd1 = vec_max(vd, vd);
+;  vf2 = vec_vmaxfp(vf, vf);
+;  vf3 = vec_min(vf, vf);
+;  vd2 = vec_min(vd, vd);
+;  vf4 = vec_vminfp(vf, vf);
+;  d1 = __builtin_vsx_xsmaxdp(d, d);
+;  d2 = __builtin_vsx_xsmindp(d, d);
+;}
diff --git a/test/CodeGen/PowerPC/vsx-p8.ll b/test/CodeGen/PowerPC/vsx-p8.ll
new file mode 100644
index 0000000..81406b6
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-p8.ll
@@ -0,0 +1,42 @@
+; RUN: llc -mcpu=pwr8 -mattr=+power8-vector < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Unaligned loads/stores on P8 and later should use VSX where possible.
+
+define <2 x double> @test28u(<2 x double>* %a) {
+  %v = load <2 x double>* %a, align 8
+  ret <2 x double> %v
+
+; CHECK-LABEL: @test28u
+; CHECK: lxvd2x 34, 0, 3
+; CHECK: blr
+}
+
+define void @test29u(<2 x double>* %a, <2 x double> %b) {
+  store <2 x double> %b, <2 x double>* %a, align 8
+  ret void
+
+; CHECK-LABEL: @test29u
+; CHECK: stxvd2x 34, 0, 3
+; CHECK: blr
+}
+
+define <4 x float> @test32u(<4 x float>* %a) {
+  %v = load <4 x float>* %a, align 8
+  ret <4 x float> %v
+
+; CHECK-LABEL: @test32u
+; CHECK: lxvw4x 34, 0, 3
+; CHECK: blr
+}
+
+define void @test33u(<4 x float>* %a, <4 x float> %b) {
+  store <4 x float> %b, <4 x float>* %a, align 8
+  ret void
+
+; CHECK-LABEL: @test33u
+; CHECK: stxvw4x 34, 0, 3
+; CHECK: blr
+}
+
diff --git a/test/CodeGen/PowerPC/vsx.ll b/test/CodeGen/PowerPC/vsx.ll
index f5ac577..333b75a 100644
--- a/test/CodeGen/PowerPC/vsx.ll
+++ b/test/CodeGen/PowerPC/vsx.ll
@@ -356,6 +356,63 @@ define void @test31(<2 x i64>* %a, <2 x i64> %b) {
 ; CHECK: blr
 }
 
+define <4 x float> @test32(<4 x float>* %a) {
+  %v = load <4 x float>* %a, align 16
+  ret <4 x float> %v
+
+; CHECK-LABEL: @test32
+; CHECK: lxvw4x 34, 0, 3
+; CHECK: blr
+}
+
+define void @test33(<4 x float>* %a, <4 x float> %b) {
+  store <4 x float> %b, <4 x float>* %a, align 16
+  ret void
+
+; CHECK-LABEL: @test33
+; CHECK: stxvw4x 34, 0, 3
+; CHECK: blr
+}
+
+define <4 x float> @test32u(<4 x float>* %a) {
+  %v = load <4 x float>* %a, align 8
+  ret <4 x float> %v
+
+; CHECK-LABEL: @test32u
+; CHECK-DAG: lvsl
+; CHECK-DAG: lvx
+; CHECK-DAG: lvx
+; CHECK: vperm 2,
+; CHECK: blr
+}
+
+define void @test33u(<4 x float>* %a, <4 x float> %b) {
+  store <4 x float> %b, <4 x float>* %a, align 8
+  ret void
+
+; CHECK-LABEL: @test33u
+; CHECK: stxvw4x 34, 0, 3
+; CHECK: blr
+}
+
+define <4 x i32> @test34(<4 x i32>* %a) {
+  %v = load <4 x i32>* %a, align 16
+  ret <4 x i32> %v
+
+; CHECK-LABEL: @test34
+; CHECK: lxvw4x 34, 0, 3
+; CHECK: blr
+}
+
+define void @test35(<4 x i32>* %a, <4 x i32> %b) {
+  store <4 x i32> %b, <4 x i32>* %a, align 16
+  ret void
+
+; CHECK-LABEL: @test35
+; CHECK: stxvw4x 34, 0, 3
+; CHECK: blr
+}
+
 define <2 x double> @test40(<2 x i64> %a) {
   %v = uitofp <2 x i64> %a to <2 x double>
   ret <2 x double> %v
@@ -634,7 +691,7 @@ define <2 x i32> @test80(i32 %v) {
 ; CHECK-DAG: addi [[R1:[0-9]+]], 3, 3
 ; CHECK-DAG: addi [[R2:[0-9]+]], 1, -16
 ; CHECK-DAG: addi [[R3:[0-9]+]], 3, 2
-; CHECK: std [[R1]], 8([[R2]])
+; CHECK: std [[R1]], -8(1)
 ; CHECK: std [[R3]], -16(1)
 ; CHECK: lxvd2x 34, 0, [[R2]]
 ; CHECK-NOT: stxvd2x
@@ -649,3 +706,14 @@ define <2 x double> @test81(<4 x float> %b) {
 ; CHECK: blr
 }
 
+define double @test82(double %a, double %b, double %c, double %d) {
+entry:
+  %m = fcmp oeq double %c, %d
+  %v = select i1 %m, double %a, double %b
+  ret double %v
+
+; CHECK-LABEL: @test82
+; CHECK: xscmpudp [[REG:[0-9]+]], 3, 4
+; CHECK: beqlr [[REG]]
+}
+
diff --git a/test/CodeGen/R600/128bit-kernel-args.ll b/test/CodeGen/R600/128bit-kernel-args.ll
index 3c4fcf7..d9b0ff2 100644
--- a/test/CodeGen/R600/128bit-kernel-args.ll
+++ b/test/CodeGen/R600/128bit-kernel-args.ll
@@ -1,27 +1,27 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; R600-CHECK: @v4i32_kernel_arg
+; R600-CHECK: {{^}}v4i32_kernel_arg:
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR:[0-9]]].X, KC0[3].Y
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].Y, KC0[3].Z
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].Z, KC0[3].W
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].W, KC0[4].X
-; SI-CHECK: @v4i32_kernel_arg
-; SI-CHECK: BUFFER_STORE_DWORDX4
+; SI-CHECK: {{^}}v4i32_kernel_arg:
+; SI-CHECK: buffer_store_dwordx4
 define void @v4i32_kernel_arg(<4 x i32> addrspace(1)* %out, <4 x i32>  %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK: @v4f32_kernel_arg
+; R600-CHECK: {{^}}v4f32_kernel_arg:
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR:[0-9]]].X, KC0[3].Y
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].Y, KC0[3].Z
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].Z, KC0[3].W
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].W, KC0[4].X
-; SI-CHECK: @v4f32_kernel_arg
-; SI-CHECK: BUFFER_STORE_DWORDX4
-define void @v4f32_kernel_args(<4 x float> addrspace(1)* %out, <4 x float>  %in) {
+; SI-CHECK: {{^}}v4f32_kernel_arg:
+; SI-CHECK: buffer_store_dwordx4
+define void @v4f32_kernel_arg(<4 x float> addrspace(1)* %out, <4 x float>  %in) {
 entry:
   store <4 x float> %in, <4 x float> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/32-bit-local-address-space.ll b/test/CodeGen/R600/32-bit-local-address-space.ll
index 7dec426..4ff2762 100644
--- a/test/CodeGen/R600/32-bit-local-address-space.ll
+++ b/test/CodeGen/R600/32-bit-local-address-space.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and
 ; the global address space(1) uses 64-bit pointers.  These tests check to make sure
@@ -9,9 +9,9 @@
 ; Instructions with B32, U32, and I32 in their name take 32-bit operands, while
 ; instructions with B64, U64, and I64 take 64-bit operands.
 
-; CHECK-LABEL: @local_address_load
-; CHECK: V_MOV_B32_e{{32|64}} [[PTR:v[0-9]]]
-; CHECK: DS_READ_B32 v{{[0-9]+}}, [[PTR]]
+; FUNC-LABEL: {{^}}local_address_load:
+; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]]
+; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
 define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = load i32 addrspace(3)* %in
@@ -19,10 +19,10 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @local_address_gep
-; CHECK: S_ADD_I32 [[SPTR:s[0-9]]]
-; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; CHECK: DS_READ_B32 [[VPTR]]
+; FUNC-LABEL: {{^}}local_address_gep:
+; SI: s_add_i32 [[SPTR:s[0-9]]]
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_read_b32 [[VPTR]]
 define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
 entry:
   %0 = getelementptr i32 addrspace(3)* %in, i32 %offset
@@ -31,9 +31,9 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @local_address_gep_const_offset
-; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
-; CHECK: DS_READ_B32 v{{[0-9]+}}, [[VPTR]], 0x4,
+; FUNC-LABEL: {{^}}local_address_gep_const_offset:
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
+; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4
 define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = getelementptr i32 addrspace(3)* %in, i32 1
@@ -43,10 +43,10 @@ entry:
 }
 
 ; Offset too large, can't fold into 16-bit immediate offset.
-; CHECK-LABEL: @local_address_gep_large_const_offset
-; CHECK: S_ADD_I32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
-; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; CHECK: DS_READ_B32 [[VPTR]]
+; FUNC-LABEL: {{^}}local_address_gep_large_const_offset:
+; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_read_b32 [[VPTR]]
 define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = getelementptr i32 addrspace(3)* %in, i32 16385
@@ -55,10 +55,10 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @null_32bit_lds_ptr:
-; CHECK: V_CMP_NE_I32
-; CHECK-NOT: V_CMP_NE_I32
-; CHECK: V_CNDMASK_B32
+; FUNC-LABEL: {{^}}null_32bit_lds_ptr:
+; SI: v_cmp_ne_i32
+; SI-NOT: v_cmp_ne_i32
+; SI: v_cndmask_b32
 define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
   %cmp = icmp ne i32 addrspace(3)* %lds, null
   %x = select i1 %cmp, i32 123, i32 456
@@ -66,10 +66,10 @@ define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds)
   ret void
 }
 
-; CHECK-LABEL: @mul_32bit_ptr:
-; CHECK: V_MUL_LO_I32
-; CHECK-NEXT: V_ADD_I32_e32
-; CHECK-NEXT: DS_READ_B32
+; FUNC-LABEL: {{^}}mul_32bit_ptr:
+; SI: s_mul_i32
+; SI-NEXT: s_add_i32
+; SI: ds_read_b32
 define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
   %ptr = getelementptr [3 x float] addrspace(3)* %lds, i32 %tid, i32 0
   %val = load float addrspace(3)* %ptr
@@ -77,11 +77,11 @@ define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %
   ret void
 }
 
-@g_lds = addrspace(3) global float zeroinitializer, align 4
+@g_lds = addrspace(3) global float undef, align 4
 
-; CHECK-LABEL: @infer_ptr_alignment_global_offset:
-; CHECK: V_MOV_B32_e32 [[REG:v[0-9]+]], 0
-; CHECK: DS_READ_B32 v{{[0-9]+}}, [[REG]]
+; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
 define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
   %val = load float addrspace(3)* @g_lds
   store float %val, float addrspace(1)* %out
@@ -89,37 +89,37 @@ define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %ti
 }
 
 
-@ptr = addrspace(3) global i32 addrspace(3)* null
-@dst = addrspace(3) global [16384 x i32] zeroinitializer
+@ptr = addrspace(3) global i32 addrspace(3)* undef
+@dst = addrspace(3) global [16384 x i32] undef
 
-; CHECK-LABEL: @global_ptr:
-; CHECK: DS_WRITE_B32
+; FUNC-LABEL: {{^}}global_ptr:
+; SI: ds_write_b32
 define void @global_ptr() nounwind {
   store i32 addrspace(3)* getelementptr ([16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
   ret void
 }
 
-; CHECK-LABEL: @local_address_store
-; CHECK: DS_WRITE_B32
+; FUNC-LABEL: {{^}}local_address_store:
+; SI: ds_write_b32
 define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
   store i32 %val, i32 addrspace(3)* %out
   ret void
 }
 
-; CHECK-LABEL: @local_address_gep_store
-; CHECK: S_ADD_I32 [[SADDR:s[0-9]+]],
-; CHECK: V_MOV_B32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
-; CHECK: DS_WRITE_B32 [[ADDR]], v{{[0-9]+}},
+; FUNC-LABEL: {{^}}local_address_gep_store:
+; SI: s_add_i32 [[SADDR:s[0-9]+]],
+; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
+; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}}
 define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
   %gep = getelementptr i32 addrspace(3)* %out, i32 %offset
   store i32 %val, i32 addrspace(3)* %gep, align 4
   ret void
 }
 
-; CHECK-LABEL: @local_address_gep_const_offset_store
-; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
-; CHECK: V_MOV_B32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
-; CHECK: DS_WRITE_B32 [[VPTR]], [[VAL]], 0x4
+; FUNC-LABEL: {{^}}local_address_gep_const_offset_store:
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
+; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
+; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4
 define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
   %gep = getelementptr i32 addrspace(3)* %out, i32 1
   store i32 %val, i32 addrspace(3)* %gep, align 4
@@ -127,10 +127,10 @@ define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %v
 }
 
 ; Offset too large, can't fold into 16-bit immediate offset.
-; CHECK-LABEL: @local_address_gep_large_const_offset_store
-; CHECK: S_ADD_I32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
-; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; CHECK: DS_WRITE_B32 [[VPTR]], v{{[0-9]+}}, 0
+; FUNC-LABEL: {{^}}local_address_gep_large_const_offset_store:
+; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_write_b32 [[VPTR]], v{{[0-9]+}} [M0]{{$}}
 define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
   %gep = getelementptr i32 addrspace(3)* %out, i32 16385
   store i32 %val, i32 addrspace(3)* %gep, align 4
diff --git a/test/CodeGen/R600/64bit-kernel-args.ll b/test/CodeGen/R600/64bit-kernel-args.ll
index 2d82c1e..cf4e055 100644
--- a/test/CodeGen/R600/64bit-kernel-args.ll
+++ b/test/CodeGen/R600/64bit-kernel-args.ll
@@ -1,9 +1,9 @@
 ; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; SI-CHECK: @f64_kernel_arg
-; SI-CHECK-DAG: S_LOAD_DWORDX2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
-; SI-CHECK-DAG: S_LOAD_DWORDX2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
-; SI-CHECK: BUFFER_STORE_DWORDX2
+; SI-CHECK: {{^}}f64_kernel_arg:
+; SI-CHECK-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
+; SI-CHECK-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
+; SI-CHECK: buffer_store_dwordx2
 define void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
 entry:
   store double %in, double addrspace(1)* %out
diff --git a/test/CodeGen/R600/add-debug.ll b/test/CodeGen/R600/add-debug.ll
new file mode 100644
index 0000000..166e0f6
--- /dev/null
+++ b/test/CodeGen/R600/add-debug.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=r600 -mcpu=tahiti -debug
+; REQUIRES: asserts
+
+; Check that SelectionDAGDumper does not crash on int_SI_if.
+define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+entry:
+  %0 = icmp eq i64 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = load i64 addrspace(1)* %in
+  br label %endif
+
+else:
+  %2 = add i64 %a, %b
+  br label %endif
+
+endif:
+  %3 = phi i64 [%1, %if], [%2, %else]
+  store i64 %3, i64 addrspace(1)* %out
+  ret void
+}
+
diff --git a/test/CodeGen/R600/add.ll b/test/CodeGen/R600/add.ll
index 711a2bc..767a642 100644
--- a/test/CodeGen/R600/add.ll
+++ b/test/CodeGen/R600/add.ll
@@ -1,12 +1,12 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK --check-prefix=FUNC %s
 ; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=FUNC %s
 
-;FUNC-LABEL: @test1:
+;FUNC-LABEL: {{^}}test1:
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: V_ADD_I32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_add_i32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}}
 ;SI-CHECK-NOT: [[REG]]
-;SI-CHECK: BUFFER_STORE_DWORD [[REG]],
+;SI-CHECK: buffer_store_dword [[REG]],
 define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
   %a = load i32 addrspace(1)* %in
@@ -16,12 +16,12 @@ define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   ret void
 }
 
-;FUNC-LABEL: @test2:
+;FUNC-LABEL: {{^}}test2:
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -32,16 +32,16 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   ret void
 }
 
-;FUNC-LABEL: @test4:
+;FUNC-LABEL: {{^}}test4:
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -52,7 +52,7 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   ret void
 }
 
-; FUNC-LABEL: @test8
+; FUNC-LABEL: {{^}}test8:
 ; EG-CHECK: ADD_INT
 ; EG-CHECK: ADD_INT
 ; EG-CHECK: ADD_INT
@@ -61,14 +61,14 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 ; EG-CHECK: ADD_INT
 ; EG-CHECK: ADD_INT
 ; EG-CHECK: ADD_INT
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
 define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
 entry:
   %0 = add <8 x i32> %a, %b
@@ -76,7 +76,7 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @test16
+; FUNC-LABEL: {{^}}test16:
 ; EG-CHECK: ADD_INT
 ; EG-CHECK: ADD_INT
 ; EG-CHECK: ADD_INT
@@ -93,22 +93,22 @@ entry:
 ; EG-CHECK: ADD_INT
 ; EG-CHECK: ADD_INT
 ; EG-CHECK: ADD_INT
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
 define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
 entry:
   %0 = add <16 x i32> %a, %b
@@ -116,9 +116,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @add64
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADDC_U32
+; FUNC-LABEL: {{^}}add64:
+; SI-CHECK: s_add_u32
+; SI-CHECK: s_addc_u32
 define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = add i64 %a, %b
@@ -126,13 +126,13 @@ entry:
   ret void
 }
 
-; The V_ADDC_U32 and V_ADD_I32 instruction can't read SGPRs, because they
+; The v_addc_u32 and v_add_i32 instruction can't read SGPRs, because they
 ; use VCC.  The test is designed so that %a will be stored in an SGPR and
 ; %0 will be stored in a VGPR, so the comiler will be forced to copy %a
 ; to a VGPR before doing the add.
 
-; FUNC-LABEL: @add64_sgpr_vgpr
-; SI-CHECK-NOT: V_ADDC_U32_e32 s
+; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
+; SI-CHECK-NOT: v_addc_u32_e32 s
 define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
 entry:
   %0 = load i64 addrspace(1)* %in
@@ -141,12 +141,10 @@ entry:
   ret void
 }
 
-; Test i64 add inside a branch.  We don't allow SALU instructions inside of
-; branches.
-; FIXME: We are being conservative here.  We could allow this in some cases.
-; FUNC-LABEL: @add64_in_branch
-; SI-CHECK-NOT: S_ADD_I32
-; SI-CHECK-NOT: S_ADDC_U32
+; Test i64 add inside a branch.
+; FUNC-LABEL: {{^}}add64_in_branch:
+; SI-CHECK: s_add_u32
+; SI-CHECK: s_addc_u32
 define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
diff --git a/test/CodeGen/R600/add_i64.ll b/test/CodeGen/R600/add_i64.ll
index f733d90..47ecf6d 100644
--- a/test/CodeGen/R600/add_i64.ll
+++ b/test/CodeGen/R600/add_i64.ll
@@ -3,9 +3,9 @@
 
 declare i32 @llvm.r600.read.tidig.x() readnone
 
-; SI-LABEL: @test_i64_vreg:
-; SI: V_ADD_I32
-; SI: V_ADDC_U32
+; SI-LABEL: {{^}}test_i64_vreg:
+; SI: v_add_i32
+; SI: v_addc_u32
 define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
   %a_ptr = getelementptr i64 addrspace(1)* %inA, i32 %tid
@@ -18,9 +18,9 @@ define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noa
 }
 
 ; Check that the SGPR add operand is correctly moved to a VGPR.
-; SI-LABEL: @sgpr_operand:
-; SI: V_ADD_I32
-; SI: V_ADDC_U32
+; SI-LABEL: {{^}}sgpr_operand:
+; SI: v_add_i32
+; SI: v_addc_u32
 define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
   %foo = load i64 addrspace(1)* %in, align 8
   %result = add i64 %foo, %a
@@ -31,9 +31,9 @@ define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noal
 ; Swap the arguments. Check that the SGPR -> VGPR copy works with the
 ; SGPR as other operand.
 ;
-; SI-LABEL: @sgpr_operand_reversed:
-; SI: V_ADD_I32
-; SI: V_ADDC_U32
+; SI-LABEL: {{^}}sgpr_operand_reversed:
+; SI: v_add_i32
+; SI: v_addc_u32
 define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
   %foo = load i64 addrspace(1)* %in, align 8
   %result = add i64 %a, %foo
@@ -42,22 +42,22 @@ define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace
 }
 
 
-; SI-LABEL: @test_v2i64_sreg:
-; SI: S_ADD_I32
-; SI: S_ADDC_U32
-; SI: S_ADD_I32
-; SI: S_ADDC_U32
+; SI-LABEL: {{^}}test_v2i64_sreg:
+; SI: s_add_u32
+; SI: s_addc_u32
+; SI: s_add_u32
+; SI: s_addc_u32
 define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
   %result = add <2 x i64> %a, %b
   store <2 x i64> %result, <2 x i64> addrspace(1)* %out
   ret void
 }
 
-; SI-LABEL: @test_v2i64_vreg:
-; SI: V_ADD_I32
-; SI: V_ADDC_U32
-; SI: V_ADD_I32
-; SI: V_ADDC_U32
+; SI-LABEL: {{^}}test_v2i64_vreg:
+; SI: v_add_i32
+; SI: v_addc_u32
+; SI: v_add_i32
+; SI: v_addc_u32
 define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
   %a_ptr = getelementptr <2 x i64> addrspace(1)* %inA, i32 %tid
@@ -69,13 +69,13 @@ define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> add
   ret void
 }
 
-; SI-LABEL: @trunc_i64_add_to_i32
-; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG0:[0-9]+]]
-; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG1:[0-9]+]]
-; SI: S_ADD_I32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
-; SI-NOT: ADDC
-; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: BUFFER_STORE_DWORD [[VRESULT]],
+; SI-LABEL: {{^}}trunc_i64_add_to_i32:
+; SI: s_load_dword s[[SREG0:[0-9]+]]
+; SI: s_load_dword s[[SREG1:[0-9]+]]
+; SI: s_add_i32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
+; SI-NOT: addc
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
 define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
   %add = add i64 %b, %a
   %trunc = trunc i64 %add to i32
diff --git a/test/CodeGen/R600/address-space.ll b/test/CodeGen/R600/address-space.ll
index f75a8ac..d04afe6 100644
--- a/test/CodeGen/R600/address-space.ll
+++ b/test/CodeGen/R600/address-space.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
 
 ; Test that codegenprepare understands address space sizes
 
@@ -7,11 +7,11 @@
 ; FIXME: Extra V_MOV from SGPR to VGPR for second read. The address is
 ; already in a VGPR after the first read.
 
-; CHECK-LABEL: @do_as_ptr_calcs:
-; CHECK: S_LOAD_DWORD [[SREG1:s[0-9]+]],
-; CHECK: V_MOV_B32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
-; CHECK: DS_READ_B32 v{{[0-9]+}}, [[VREG1]], 0x14
-; CHECK: DS_READ_B32 v{{[0-9]+}}, v{{[0-9]+}}, 0xc
+; CHECK-LABEL: {{^}}do_as_ptr_calcs:
+; CHECK: s_load_dword [[SREG1:s[0-9]+]],
+; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
+; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12
+; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:20
 define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
 entry:
   %x = getelementptr inbounds %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
diff --git a/test/CodeGen/R600/and.ll b/test/CodeGen/R600/and.ll
index cf11481..9a76fce 100644
--- a/test/CodeGen/R600/and.ll
+++ b/test/CodeGen/R600/and.ll
@@ -1,12 +1,12 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @test2
+; FUNC-LABEL: {{^}}test2:
 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -17,16 +17,16 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   ret void
 }
 
-; FUNC-LABEL: @test4
+; FUNC-LABEL: {{^}}test4:
 ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -37,24 +37,24 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   ret void
 }
 
-; FUNC-LABEL: @s_and_i32
-; SI: S_AND_B32
+; FUNC-LABEL: {{^}}s_and_i32:
+; SI: s_and_b32
 define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %and = and i32 %a, %b
   store i32 %and, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @s_and_constant_i32
-; SI: S_AND_B32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687
+; FUNC-LABEL: {{^}}s_and_constant_i32:
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687
 define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
   %and = and i32 %a, 1234567
   store i32 %and, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @v_and_i32
-; SI: V_AND_B32
+; FUNC-LABEL: {{^}}v_and_i32:
+; SI: v_and_b32
 define void @v_and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
   %a = load i32 addrspace(1)* %aptr, align 4
   %b = load i32 addrspace(1)* %bptr, align 4
@@ -63,8 +63,8 @@ define void @v_and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addr
   ret void
 }
 
-; FUNC-LABEL: @v_and_constant_i32
-; SI: V_AND_B32
+; FUNC-LABEL: {{^}}v_and_constant_i32:
+; SI: v_and_b32
 define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
   %a = load i32 addrspace(1)* %aptr, align 4
   %and = and i32 %a, 1234567
@@ -72,25 +72,34 @@ define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr)
   ret void
 }
 
-; FUNC-LABEL: @s_and_i64
-; SI: S_AND_B64
+; FUNC-LABEL: {{^}}s_and_i64:
+; SI: s_and_b64
 define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %and = and i64 %a, %b
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @s_and_constant_i64
-; SI: S_AND_B64
+; FIXME: Should use SGPRs
+; FUNC-LABEL: {{^}}s_and_i1:
+; SI: v_and_b32
+define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
+  %and = and i1 %a, %b
+  store i1 %and, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_constant_i64:
+; SI: s_and_b64
 define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
   %and = and i64 %a, 281474976710655
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @v_and_i64
-; SI: V_AND_B32
-; SI: V_AND_B32
+; FUNC-LABEL: {{^}}v_and_i64:
+; SI: v_and_b32
+; SI: v_and_b32
 define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
   %a = load i64 addrspace(1)* %aptr, align 8
   %b = load i64 addrspace(1)* %bptr, align 8
@@ -99,12 +108,51 @@ define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addr
   ret void
 }
 
-; FUNC-LABEL: @v_and_constant_i64
-; SI: V_AND_B32
-; SI: V_AND_B32
+; FUNC-LABEL: {{^}}v_and_i64_br:
+; SI: v_and_b32
+; SI: v_and_b32
+define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i32 %cond) {
+entry:
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %if, label %endif
+
+if:
+  %a = load i64 addrspace(1)* %aptr, align 8
+  %b = load i64 addrspace(1)* %bptr, align 8
+  %and = and i64 %a, %b
+  br label %endif
+
+endif:
+  %tmp1 = phi i64 [%and, %if], [0, %entry]
+  store i64 %tmp1, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_and_constant_i64:
+; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, 1234567
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
 }
+
+; FIXME: Replace and 0 with mov 0
+; FUNC-LABEL: {{^}}v_and_inline_imm_i64:
+; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
+; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
+define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+  %a = load i64 addrspace(1)* %aptr, align 8
+  %and = and i64 %a, 64
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_i64:
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 64
+define void @s_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 64
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/test/CodeGen/R600/anyext.ll b/test/CodeGen/R600/anyext.ll
index bbe5d0a..23fdcbb 100644
--- a/test/CodeGen/R600/anyext.ll
+++ b/test/CodeGen/R600/anyext.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-; CHECK-LABEL: @anyext_i1_i32
-; CHECK: V_CNDMASK_B32_e64
+; CHECK-LABEL: {{^}}anyext_i1_i32:
+; CHECK: v_cndmask_b32_e64
 define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp eq i32 %cond, 0
diff --git a/test/CodeGen/R600/array-ptr-calc-i32.ll b/test/CodeGen/R600/array-ptr-calc-i32.ll
index 3230353..84d3540 100644
--- a/test/CodeGen/R600/array-ptr-calc-i32.ll
+++ b/test/CodeGen/R600/array-ptr-calc-i32.ll
@@ -1,4 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
 
 declare i32 @llvm.SI.tid() nounwind readnone
 declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
@@ -8,14 +9,21 @@ declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
 ; 64-bit pointer add. This should work since private pointers should
 ; be 32-bits.
 
-; SI-LABEL: @test_private_array_ptr_calc:
-; SI: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
+; SI-LABEL: {{^}}test_private_array_ptr_calc:
+
+; FIXME: We end up with zero argument for ADD, because
+; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index
+; with the appropriate offset.  We should fold this into the store.
+; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}}
+; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}]
 ;
 ; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
 ; alloca to a vector.  It currently fails because it does not know how
 ; to interpret:
 ; getelementptr [4 x i32]* %alloca, i32 1, i32 %b
-; SI: DS_WRITE_B32 {{v[0-9]+}}, [[PTRREG]]
+
+; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], 16
+; SI-PROMOTE: ds_write_b32 [[PTRREG]]
 define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
   %alloca = alloca [4 x i32], i32 4, align 16
   %tid = call i32 @llvm.SI.tid() readnone
diff --git a/test/CodeGen/R600/array-ptr-calc-i64.ll b/test/CodeGen/R600/array-ptr-calc-i64.ll
index e254c5f..75f6394 100644
--- a/test/CodeGen/R600/array-ptr-calc-i64.ll
+++ b/test/CodeGen/R600/array-ptr-calc-i64.ll
@@ -1,13 +1,13 @@
-; XFAIL: *
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare i32 @llvm.SI.tid() readnone
 
-
-; SI-LABEL: @test_array_ptr_calc(
-define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [16 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
+; SI-LABEL: {{^}}test_array_ptr_calc:
+; SI: v_mul_lo_i32
+; SI: v_mul_hi_i32
+define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.SI.tid() readnone
-  %a_ptr = getelementptr [16 x i32] addrspace(1)* %inA, i32 1, i32 %tid
+  %a_ptr = getelementptr [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0
   %b_ptr = getelementptr i32 addrspace(1)* %inB, i32 %tid
   %a = load i32 addrspace(1)* %a_ptr
   %b = load i32 addrspace(1)* %b_ptr
@@ -15,4 +15,3 @@ define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [16 x i32] addr
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
-
diff --git a/test/CodeGen/R600/atomic_cmp_swap_local.ll b/test/CodeGen/R600/atomic_cmp_swap_local.ll
index eb9539e..223f4d3 100644
--- a/test/CodeGen/R600/atomic_cmp_swap_local.ll
+++ b/test/CodeGen/R600/atomic_cmp_swap_local.ll
@@ -1,13 +1,14 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=CI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @lds_atomic_cmpxchg_ret_i32_offset:
-; SI: S_LOAD_DWORD [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: S_LOAD_DWORD [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: V_MOV_B32_e32 [[VCMP:v[0-9]+]], 7
-; SI-DAG: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI-DAG: V_MOV_B32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
-; SI: DS_CMPST_RTN_B32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]], 0x10, [M0]
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
+; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; SI-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
+; SI: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 [M0]
+; SI: s_endpgm
 define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
@@ -16,18 +17,18 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_cmpxchg_ret_i64_offset:
-; SI: S_LOAD_DWORDX2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI: S_LOAD_DWORD [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: S_MOV_B64  s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
-; SI-DAG: V_MOV_B32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
-; SI-DAG: V_MOV_B32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
-; SI-DAG: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI-DAG: V_MOV_B32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
-; SI-DAG: V_MOV_B32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
-; SI: DS_CMPST_RTN_B64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}}, 0x20, [M0]
-; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: s_mov_b64  s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
+; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
+; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
+; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; SI-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
+; SI-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
+; SI: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 [M0]
+; SI: buffer_store_dwordx2 [[RESULT]],
+; SI: s_endpgm
 define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
@@ -35,3 +36,50 @@ define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrs
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
+
+; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset
+; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 [M0]
+; SI: s_endpgm
+define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
+  %sub = sub i32 %a, %b
+  %add = add i32 %sub, 4
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 %add
+  %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
+  %result = extractvalue { i32, i1 } %pair, 0
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset:
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
+; SI-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
+; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; SI-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
+; SI: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 [M0]
+; SI: s_endpgm
+define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
+  %result = extractvalue { i32, i1 } %pair, 0
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_mov_b64  s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
+; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
+; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
+; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; SI-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
+; SI-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
+; SI: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 [M0]
+; SI: s_endpgm
+define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
+  %result = extractvalue { i64, i1 } %pair, 0
+  ret void
+}
diff --git a/test/CodeGen/R600/atomic_load_add.ll b/test/CodeGen/R600/atomic_load_add.ll
index c26f9cd..f0eff21 100644
--- a/test/CodeGen/R600/atomic_load_add.ll
+++ b/test/CodeGen/R600/atomic_load_add.ll
@@ -1,35 +1,35 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
-; FUNC-LABEL: @atomic_add_local
+; FUNC-LABEL: {{^}}atomic_add_local:
 ; R600: LDS_ADD *
-; SI: DS_ADD_RTN_U32
+; SI: ds_add_u32
 define void @atomic_add_local(i32 addrspace(3)* %local) {
    %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
    ret void
 }
 
-; FUNC-LABEL: @atomic_add_local_const_offset
+; FUNC-LABEL: {{^}}atomic_add_local_const_offset:
 ; R600: LDS_ADD *
-; SI: DS_ADD_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
+; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
   %gep = getelementptr i32 addrspace(3)* %local, i32 4
   %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
   ret void
 }
 
-; FUNC-LABEL: @atomic_add_ret_local
+; FUNC-LABEL: {{^}}atomic_add_ret_local:
 ; R600: LDS_ADD_RET *
-; SI: DS_ADD_RTN_U32
+; SI: ds_add_rtn_u32
 define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @atomic_add_ret_local_const_offset
+; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset:
 ; R600: LDS_ADD_RET *
-; SI: DS_ADD_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x14
+; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
 define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %gep = getelementptr i32 addrspace(3)* %local, i32 5
   %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
diff --git a/test/CodeGen/R600/atomic_load_sub.ll b/test/CodeGen/R600/atomic_load_sub.ll
index 3569d91..61ff296 100644
--- a/test/CodeGen/R600/atomic_load_sub.ll
+++ b/test/CodeGen/R600/atomic_load_sub.ll
@@ -1,35 +1,35 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @atomic_sub_local
+; FUNC-LABEL: {{^}}atomic_sub_local:
 ; R600: LDS_SUB *
-; SI: DS_SUB_RTN_U32
+; SI: ds_sub_u32
 define void @atomic_sub_local(i32 addrspace(3)* %local) {
    %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
    ret void
 }
 
-; FUNC-LABEL: @atomic_sub_local_const_offset
+; FUNC-LABEL: {{^}}atomic_sub_local_const_offset:
 ; R600: LDS_SUB *
-; SI: DS_SUB_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
+; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
   %gep = getelementptr i32 addrspace(3)* %local, i32 4
   %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
   ret void
 }
 
-; FUNC-LABEL: @atomic_sub_ret_local
+; FUNC-LABEL: {{^}}atomic_sub_ret_local:
 ; R600: LDS_SUB_RET *
-; SI: DS_SUB_RTN_U32
+; SI: ds_sub_rtn_u32
 define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @atomic_sub_ret_local_const_offset
+; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset:
 ; R600: LDS_SUB_RET *
-; SI: DS_SUB_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x14
+; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
 define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %gep = getelementptr i32 addrspace(3)* %local, i32 5
   %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
diff --git a/test/CodeGen/R600/basic-branch.ll b/test/CodeGen/R600/basic-branch.ll
index d084132..073ab79 100644
--- a/test/CodeGen/R600/basic-branch.ll
+++ b/test/CodeGen/R600/basic-branch.ll
@@ -1,7 +1,7 @@
 ; XFAIL: *
 ; RUN: llc -O0 -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
 
-; CHECK-LABEL: @test_branch(
+; CHECK-LABEL: {{^}}test_branch(
 define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
   %cmp = icmp ne i32 %val, 0
   br i1 %cmp, label %store, label %end
diff --git a/test/CodeGen/R600/basic-loop.ll b/test/CodeGen/R600/basic-loop.ll
index 6d0ff07..3cd609135 100644
--- a/test/CodeGen/R600/basic-loop.ll
+++ b/test/CodeGen/R600/basic-loop.ll
@@ -1,7 +1,7 @@
 ; XFAIL: *
 ; RUN: llc -O0 -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s
 
-; CHECK-LABEL: @test_loop:
+; CHECK-LABEL: {{^}}test_loop:
 define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
 entry:
   br label %loop.body
diff --git a/test/CodeGen/R600/bfe_uint.ll b/test/CodeGen/R600/bfe_uint.ll
index fe466e6..6fe23e9 100644
--- a/test/CodeGen/R600/bfe_uint.ll
+++ b/test/CodeGen/R600/bfe_uint.ll
@@ -2,7 +2,7 @@
 
 ; XFAIL: *
 
-; CHECK: @bfe_def
+; CHECK: {{^}}bfe_def:
 ; CHECK: BFE_UINT
 define void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
 entry:
@@ -17,7 +17,7 @@ entry:
 ; implmented with a LSHR instruction, which is better, because LSHR has less
 ; operands and requires less constants.
 
-; CHECK: @bfe_shift
+; CHECK: {{^}}bfe_shift:
 ; CHECK-NOT: BFE_UINT
 define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
 entry:
diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll
index bbfe856..2a0bb37 100644
--- a/test/CodeGen/R600/bfi_int.ll
+++ b/test/CodeGen/R600/bfi_int.ll
@@ -4,10 +4,10 @@
 ; BFI_INT Definition pattern from ISA docs
 ; (y & x) | (z & ~x)
 ;
-; R600-CHECK: @bfi_def
+; R600-CHECK: {{^}}bfi_def:
 ; R600-CHECK: BFI_INT
 ; SI-CHECK:   @bfi_def
-; SI-CHECK:   V_BFI_B32
+; SI-CHECK:   v_bfi_b32
 define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = xor i32 %x, -1
@@ -20,10 +20,10 @@ entry:
 
 ; SHA-256 Ch function
 ; z ^ (x & (y ^ z))
-; R600-CHECK: @bfi_sha256_ch
+; R600-CHECK: {{^}}bfi_sha256_ch:
 ; R600-CHECK: BFI_INT
 ; SI-CHECK:   @bfi_sha256_ch
-; SI-CHECK:   V_BFI_B32
+; SI-CHECK:   v_bfi_b32
 define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = xor i32 %y, %z
@@ -35,11 +35,11 @@ entry:
 
 ; SHA-256 Ma function
 ; ((x & z) | (y & (x | z)))
-; R600-CHECK: @bfi_sha256_ma
+; R600-CHECK: {{^}}bfi_sha256_ma:
 ; R600-CHECK: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W
 ; R600-CHECK: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W
-; SI-CHECK: V_XOR_B32_e64 [[DST:v[0-9]+]], {{[sv][0-9]+, v[0-9]+}}
-; SI-CHECK: V_BFI_B32 {{v[0-9]+}}, [[DST]], {{[sv][0-9]+, [sv][0-9]+}}
+; SI-CHECK: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}}
+; SI-CHECK: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}}
 
 define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
diff --git a/test/CodeGen/R600/big_alu.ll b/test/CodeGen/R600/big_alu.ll
index 511e8ef..28be216 100644
--- a/test/CodeGen/R600/big_alu.ll
+++ b/test/CodeGen/R600/big_alu.ll
@@ -1,5 +1,4 @@
 ;RUN: llc < %s -march=r600 -mcpu=cedar
-;REQUIRES: asserts
 
 ;This test ensures that R600 backend can handle ifcvt properly
 ;and do not generate ALU clauses with more than 128 instructions.
diff --git a/test/CodeGen/R600/bitcast.ll b/test/CodeGen/R600/bitcast.ll
index 0be79e6..725d5ba 100644
--- a/test/CodeGen/R600/bitcast.ll
+++ b/test/CodeGen/R600/bitcast.ll
@@ -4,8 +4,8 @@
 
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
 
-; FUNC-LABEL: @v32i8_to_v8i32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v32i8_to_v8i32:
+; SI: s_endpgm
 define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
 entry:
   %1 = load <32 x i8> addrspace(2)* %0
@@ -17,8 +17,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i8ptr_v16i8ptr
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}i8ptr_v16i8ptr:
+; SI: s_endpgm
 define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)*
@@ -55,8 +55,8 @@ define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nou
   ret void
 }
 
-; FUNC-LABEL: @bitcast_v2i32_to_f64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64:
+; SI: s_endpgm
 define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %val = load <2 x i32> addrspace(1)* %in, align 8
   %add = add <2 x i32> %val, <i32 4, i32 9>
@@ -65,8 +65,8 @@ define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace
   ret void
 }
 
-; FUNC-LABEL: @bitcast_f64_to_v2i32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32:
+; SI: s_endpgm
 define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
   %val = load double addrspace(1)* %in, align 8
   %add = fadd double %val, 4.0
diff --git a/test/CodeGen/R600/bswap.ll b/test/CodeGen/R600/bswap.ll
index 6aebe85..1c5a0c6 100644
--- a/test/CodeGen/R600/bswap.ll
+++ b/test/CodeGen/R600/bswap.ll
@@ -1,12 +1,21 @@
-; RUN: llc -march=r600 -mcpu=SI < %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare i32 @llvm.bswap.i32(i32) nounwind readnone
 declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone
 declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
+declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone
 declare i64 @llvm.bswap.i64(i64) nounwind readnone
 declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone
 declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
 
+; FUNC-LABEL: @test_bswap_i32
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8
+; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24
+; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff
+; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
 define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32 addrspace(1)* %in, align 4
   %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
@@ -14,6 +23,14 @@ define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
   ret void
 }
 
+; FUNC-LABEL: @test_bswap_v2i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI: s_endpgm
 define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
   %val = load <2 x i32> addrspace(1)* %in, align 8
   %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
@@ -21,6 +38,20 @@ define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(
   ret void
 }
 
+; FUNC-LABEL: @test_bswap_v4i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI: s_endpgm
 define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
   %val = load <4 x i32> addrspace(1)* %in, align 16
   %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
@@ -28,6 +59,39 @@ define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(
   ret void
 }
 
+; FUNC-LABEL: @test_bswap_v8i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI: s_endpgm
+define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
+  %val = load <8 x i32> addrspace(1)* %in, align 32
+  %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone
+  store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32
+  ret void
+}
+
 define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %val = load i64 addrspace(1)* %in, align 8
   %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
diff --git a/test/CodeGen/R600/build_vector.ll b/test/CodeGen/R600/build_vector.ll
index 8179de1..9137eee 100644
--- a/test/CodeGen/R600/build_vector.ll
+++ b/test/CodeGen/R600/build_vector.ll
@@ -1,32 +1,32 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; R600-CHECK: @build_vector2
+; R600-CHECK: {{^}}build_vector2:
 ; R600-CHECK: MOV
 ; R600-CHECK: MOV
 ; R600-CHECK-NOT: MOV
-; SI-CHECK: @build_vector2
-; SI-CHECK-DAG: V_MOV_B32_e32 v[[X:[0-9]]], 5
-; SI-CHECK-DAG: V_MOV_B32_e32 v[[Y:[0-9]]], 6
-; SI-CHECK: BUFFER_STORE_DWORDX2 v{{\[}}[[X]]:[[Y]]{{\]}}
+; SI-CHECK: {{^}}build_vector2:
+; SI-CHECK-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
+; SI-CHECK-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
+; SI-CHECK: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}}
 define void @build_vector2 (<2 x i32> addrspace(1)* %out) {
 entry:
   store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK: @build_vector4
+; R600-CHECK: {{^}}build_vector4:
 ; R600-CHECK: MOV
 ; R600-CHECK: MOV
 ; R600-CHECK: MOV
 ; R600-CHECK: MOV
 ; R600-CHECK-NOT: MOV
-; SI-CHECK: @build_vector4
-; SI-CHECK-DAG: V_MOV_B32_e32 v[[X:[0-9]]], 5
-; SI-CHECK-DAG: V_MOV_B32_e32 v[[Y:[0-9]]], 6
-; SI-CHECK-DAG: V_MOV_B32_e32 v[[Z:[0-9]]], 7
-; SI-CHECK-DAG: V_MOV_B32_e32 v[[W:[0-9]]], 8
-; SI-CHECK: BUFFER_STORE_DWORDX4 v{{\[}}[[X]]:[[W]]{{\]}}
+; SI-CHECK: {{^}}build_vector4:
+; SI-CHECK-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
+; SI-CHECK-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
+; SI-CHECK-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7
+; SI-CHECK-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8
+; SI-CHECK: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}}
 define void @build_vector4 (<4 x i32> addrspace(1)* %out) {
 entry:
   store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out
diff --git a/test/CodeGen/R600/call.ll b/test/CodeGen/R600/call.ll
index d803474..1448f04 100644
--- a/test/CodeGen/R600/call.ll
+++ b/test/CodeGen/R600/call.ll
@@ -1,7 +1,7 @@
 ; RUN: not llc -march=r600 -mcpu=SI -verify-machineinstrs< %s 2>&1 | FileCheck %s
 ; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s
 
-; CHECK: error: unsupported call to function defined_function in test_call
+; CHECK: error: unsupported call to function external_function in test_call_external
 
 
 declare i32 @external_function(i32) nounwind
diff --git a/test/CodeGen/R600/call_fs.ll b/test/CodeGen/R600/call_fs.ll
index f7c4e5b..7df2240 100644
--- a/test/CodeGen/R600/call_fs.ll
+++ b/test/CodeGen/R600/call_fs.ll
@@ -2,10 +2,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood -show-mc-encoding -o - | FileCheck --check-prefix=EG-CHECK %s
 ; RUN: llc < %s -march=r600 -mcpu=rv710 -show-mc-encoding -o - | FileCheck --check-prefix=R600-CHECK %s
 
-; EG-CHECK: @call_fs
+; EG-CHECK: {{^}}call_fs:
 ; EG-CHECK: .long 257
 ; EG-CHECK: CALL_FS  ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0xc0,0x84]
-; R600-CHECK: @call_fs
+; R600-CHECK: {{^}}call_fs:
 ; R600-CHECK: .long 257
 ; R600-CHECK:CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89]
 
diff --git a/test/CodeGen/R600/cayman-loop-bug.ll b/test/CodeGen/R600/cayman-loop-bug.ll
index a873528..c7b8c40 100644
--- a/test/CodeGen/R600/cayman-loop-bug.ll
+++ b/test/CodeGen/R600/cayman-loop-bug.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
 
-; CHECK-LABEL: @main
+; CHECK-LABEL: {{^}}main:
 ; CHECK: LOOP_START_DX10
 ; CHECK: ALU_PUSH_BEFORE
 ; CHECK: LOOP_START_DX10
diff --git a/test/CodeGen/R600/cf-stack-bug.ll b/test/CodeGen/R600/cf-stack-bug.ll
index c3a4612..02c87d7 100644
--- a/test/CodeGen/R600/cf-stack-bug.ll
+++ b/test/CodeGen/R600/cf-stack-bug.ll
@@ -17,7 +17,7 @@
 ; BUG64-NOT: Applying bug work-around
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
-; FUNC-LABEL: @nested3
+; FUNC-LABEL: {{^}}nested3:
 define void @nested3(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
@@ -50,7 +50,7 @@ end:
 ; BUG64: Applying bug work-around
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
-; FUNC-LABEL: @nested4
+; FUNC-LABEL: {{^}}nested4:
 define void @nested4(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
@@ -91,7 +91,7 @@ end:
 ; BUG64: Applying bug work-around
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
-; FUNC-LABEL: @nested7
+; FUNC-LABEL: {{^}}nested7:
 define void @nested7(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
@@ -156,7 +156,7 @@ end:
 ; BUG64: Applying bug work-around
 ; BUG32: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
-; FUNC-LABEL: @nested8
+; FUNC-LABEL: {{^}}nested8:
 define void @nested8(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
diff --git a/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll b/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll
index f8b4a61..b42b904 100644
--- a/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll
+++ b/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll
@@ -1,14 +1,15 @@
-; RUN: opt -codegenprepare -S -o - %s | FileCheck --check-prefix=OPT --check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-LLC --check-prefix=FUNC %s
+; RUN: opt -codegenprepare -S -o - %s | FileCheck --check-prefix=OPT %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-LLC %s
 
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 target triple = "r600--"
 
-; FUNC-LABEL: @test
+; OPT-LABEL: @test
 ; OPT: mul nsw i32
 ; OPT-NEXT: sext
-; SI-LLC: V_MUL_LO_I32
-; SI-LLC-NOT: V_MUL_HI
+; SI-LLC-LABEL: {{^}}test:
+; SI-LLC: s_mul_i32
+; SI-LLC-NOT: mul
 define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) {
 entry:
   %0 = mul nsw i32 %a, 3
diff --git a/test/CodeGen/R600/combine_vloads.ll b/test/CodeGen/R600/combine_vloads.ll
index f8ec712..38420b2 100644
--- a/test/CodeGen/R600/combine_vloads.ll
+++ b/test/CodeGen/R600/combine_vloads.ll
@@ -9,7 +9,7 @@
 
 
 ; 128-bit loads instead of many 8-bit
-; EG-LABEL: @combine_vloads:
+; EG-LABEL: {{^}}combine_vloads:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
diff --git a/test/CodeGen/R600/commute_modifiers.ll b/test/CodeGen/R600/commute_modifiers.ll
new file mode 100644
index 0000000..30c8067
--- /dev/null
+++ b/test/CodeGen/R600/commute_modifiers.ll
@@ -0,0 +1,181 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #1
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.fma.f32(float, float, float) nounwind readnone
+
+; FUNC-LABEL: @commute_add_imm_fabs_f32
+; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, |[[X]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %x = load float addrspace(1)* %gep.0
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %z = fadd float 2.0, %x.fabs
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @commute_mul_imm_fneg_fabs_f32
+; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], -4.0, |[[X]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %x = load float addrspace(1)* %gep.0
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %x.fneg.fabs = fsub float -0.000000e+00, %x.fabs
+  %z = fmul float 4.0, %x.fneg.fabs
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @commute_mul_imm_fneg_f32
+; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]]
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %x = load float addrspace(1)* %gep.0
+  %x.fneg = fsub float -0.000000e+00, %x
+  %z = fmul float 4.0, %x.fneg
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Should use SGPR for literal.
+; FUNC-LABEL: @commute_add_lit_fabs_f32
+; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
+; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]]
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %x = load float addrspace(1)* %gep.0
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %z = fadd float 1024.0, %x.fabs
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @commute_add_fabs_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %y.fabs = call float @llvm.fabs.f32(float %y) #1
+  %z = fadd float %x, %y.fabs
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @commute_mul_fneg_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %y.fneg = fsub float -0.000000e+00, %y
+  %z = fmul float %x, %y.fneg
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @commute_mul_fabs_fneg_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %y.fabs = call float @llvm.fabs.f32(float %y) #1
+  %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
+  %z = fmul float %x, %y.fabs.fneg
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; There's no reason to commute this.
+; FUNC-LABEL: @commute_mul_fabs_x_fabs_y_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %y.fabs = call float @llvm.fabs.f32(float %y) #1
+  %z = fmul float %x.fabs, %y.fabs
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @commute_mul_fabs_x_fneg_fabs_y_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %y.fabs = call float @llvm.fabs.f32(float %y) #1
+  %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
+  %z = fmul float %x.fabs, %y.fabs.fneg
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; Make sure we commute the multiply part for the constant in src0 even
+; though we have negate modifier on src2.
+
+; SI-LABEL: {{^}}fma_a_2.0_neg_b_f32
+; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], |[[R2]]|
+; SI: buffer_store_dword [[RESULT]]
+define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %r2.fabs = call float @llvm.fabs.f32(float %r2)
+
+  %r3 = tail call float @llvm.fma.f32(float %r1, float 2.0, float %r2.fabs)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/complex-folding.ll b/test/CodeGen/R600/complex-folding.ll
index 99f0d99..a5399a7 100644
--- a/test/CodeGen/R600/complex-folding.ll
+++ b/test/CodeGen/R600/complex-folding.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK: @main
+; CHECK: {{^}}main:
 ; CHECK-NOT: MOV
 define void @main(<4 x float> inreg %reg0) #0 {
 entry:
diff --git a/test/CodeGen/R600/concat_vectors.ll b/test/CodeGen/R600/concat_vectors.ll
new file mode 100644
index 0000000..19992eb
--- /dev/null
+++ b/test/CodeGen/R600/concat_vectors.ll
@@ -0,0 +1,284 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}test_concat_v1i32:
+; 0x80f000 is the high 32 bits of the resource descriptor used by MUBUF
+; instructions that access scratch memory.  Bit 23, which is the add_tid_enable
+; bit, is only set for scratch access, so we can check for the absence of this
+; value if we want to ensure scratch memory is not being used.
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+  %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
+  store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2i32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+  %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4i32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+  %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8i32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
+  %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16i32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind {
+  %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v1f32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind {
+  %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> <i32 0, i32 1>
+  store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2f32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+  %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4f32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+  %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8f32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+  %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16f32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+  %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v1i64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
+  %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
+  store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2i64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+  %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4i64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+  %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8i64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+  %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16i64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+  %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v1f64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
+  %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
+  store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2f64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+  %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4f64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+  %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8f64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+  %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16f64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+  %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v1i1:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind {
+  %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> <i32 0, i32 1>
+  store <2 x i1> %concat, <2 x i1> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2i1:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind {
+  %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i1> %concat, <4 x i1> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4i1:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind {
+  %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i1> %concat, <8 x i1> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8i1:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind {
+  %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x i1> %concat, <16 x i1> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16i1:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind {
+  %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  store <32 x i1> %concat, <32 x i1> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v32i1:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind {
+  %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  store <64 x i1> %concat, <64 x i1> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v1i16:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind {
+  %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> <i32 0, i32 1>
+  store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2i16:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind {
+  %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4i16:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
+  %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8i16:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
+  %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16i16:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind {
+  %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64
+  ret void
+}
diff --git a/test/CodeGen/R600/copy-illegal-type.ll b/test/CodeGen/R600/copy-illegal-type.ll
new file mode 100644
index 0000000..66ea88e
--- /dev/null
+++ b/test/CodeGen/R600/copy-illegal-type.ll
@@ -0,0 +1,166 @@
+; RUN: llc -march=r600 -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}test_copy_v4i8:
+; SI: buffer_load_dword [[REG:v[0-9]+]]
+; SI: buffer_store_dword [[REG]]
+; SI: s_endpgm
+define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_x2:
+; SI: buffer_load_dword [[REG:v[0-9]+]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: s_endpgm
+define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_x3:
+; SI: buffer_load_dword [[REG:v[0-9]+]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: s_endpgm
+define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_x4:
+; SI: buffer_load_dword [[REG:v[0-9]+]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: s_endpgm
+define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI_DAG: buffer_store_byte
+
+; After scalarizing v4i8 loads is fixed.
+; XSI: buffer_load_dword
+; XSI: V_BFE
+; XSI: V_ADD
+; XSI: V_ADD
+; XSI: V_ADD
+; XSI: buffer_store_dword
+; XSI: buffer_store_dword
+
+; SI: s_endpgm
+define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI_DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI_DAG: buffer_store_byte
+
+; XSI: buffer_load_dword
+; XSI: BFE
+; XSI: buffer_store_dword
+; XSI: V_ADD
+; XSI: buffer_store_dword
+; XSI-NEXT: buffer_store_dword
+
+; SI: s_endpgm
+define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v3i8:
+; SI-NOT: bfe
+; SI-NOT: bfi
+; SI: s_endpgm
+define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+  %val = load <3 x i8> addrspace(1)* %in, align 4
+  store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: s_endpgm
+define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load volatile <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: s_endpgm
+define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/copy-to-reg.ll b/test/CodeGen/R600/copy-to-reg.ll
new file mode 100644
index 0000000..f90ee78
--- /dev/null
+++ b/test/CodeGen/R600/copy-to-reg.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=r600 -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s
+
+; Test that CopyToReg instructions don't have non-register operands prior
+; to being emitted.
+
+; Make sure this doesn't crash
+; CHECK-LABEL: {{^}}copy_to_reg_frameindex:
+define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+entry:
+  %alloca = alloca [16 x i32]
+  br label %loop
+
+loop:
+  %inc = phi i32 [0, %entry], [%inc.i, %loop]
+  %ptr = getelementptr [16 x i32]* %alloca, i32 0, i32 %inc
+  store i32 %inc, i32* %ptr
+  %inc.i = add i32 %inc, 1
+  %cnd = icmp uge i32 %inc.i, 16
+  br i1 %cnd, label %done, label %loop
+
+done:
+  %tmp0 = getelementptr [16 x i32]* %alloca, i32 0, i32 0
+  %tmp1 = load i32* %tmp0
+  store i32 %tmp1, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/ctlz_zero_undef.ll b/test/CodeGen/R600/ctlz_zero_undef.ll
index 15b5188..f699127 100644
--- a/test/CodeGen/R600/ctlz_zero_undef.ll
+++ b/test/CodeGen/R600/ctlz_zero_undef.ll
@@ -1,26 +1,31 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
 declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
 
-; FUNC-LABEL: @s_ctlz_zero_undef_i32:
-; SI: S_LOAD_DWORD [[VAL:s[0-9]+]],
-; SI: S_FLBIT_I32_B32 [[SRESULT:s[0-9]+]], [[VAL]]
-; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: BUFFER_STORE_DWORD [[VRESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i32:
+; SI: s_load_dword [[VAL:s[0-9]+]],
+; SI: s_flbit_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @v_ctlz_zero_undef_i32:
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_FFBH_U32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32 addrspace(1)* %valptr, align 4
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
@@ -28,12 +33,15 @@ define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace
   ret void
 }
 
-; FUNC-LABEL: @v_ctlz_zero_undef_v2i32:
-; SI: BUFFER_LOAD_DWORDX2
-; SI: V_FFBH_U32_e32
-; SI: V_FFBH_U32_e32
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32:
+; SI: buffer_load_dwordx2
+; SI: v_ffbh_u32_e32
+; SI: v_ffbh_u32_e32
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <2 x i32> addrspace(1)* %valptr, align 8
   %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
@@ -41,14 +49,19 @@ define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x
   ret void
 }
 
-; FUNC-LABEL: @v_ctlz_zero_undef_v4i32:
-; SI: BUFFER_LOAD_DWORDX4
-; SI: V_FFBH_U32_e32
-; SI: V_FFBH_U32_e32
-; SI: V_FFBH_U32_e32
-; SI: V_FFBH_U32_e32
-; SI: BUFFER_STORE_DWORDX4
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32:
+; SI: buffer_load_dwordx4
+; SI: v_ffbh_u32_e32
+; SI: v_ffbh_u32_e32
+; SI: v_ffbh_u32_e32
+; SI: v_ffbh_u32_e32
+; SI: buffer_store_dwordx4
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <4 x i32> addrspace(1)* %valptr, align 16
   %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
diff --git a/test/CodeGen/R600/ctpop.ll b/test/CodeGen/R600/ctpop.ll
index 15be8e1..5cfdaef 100644
--- a/test/CodeGen/R600/ctpop.ll
+++ b/test/CodeGen/R600/ctpop.ll
@@ -7,12 +7,12 @@ declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
 declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
 declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
 
-; FUNC-LABEL: @s_ctpop_i32:
-; SI: S_LOAD_DWORD [[SVAL:s[0-9]+]],
-; SI: S_BCNT1_I32_B32 [[SRESULT:s[0-9]+]], [[SVAL]]
-; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: BUFFER_STORE_DWORD [[VRESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}s_ctpop_i32:
+; SI: s_load_dword [[SVAL:s[0-9]+]],
+; SI: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
@@ -22,12 +22,12 @@ define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
 }
 
 ; XXX - Why 0 in register?
-; FUNC-LABEL: @v_ctpop_i32:
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
-; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VZERO]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_i32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_mov_b32_e32 [[VZERO:v[0-9]+]], 0
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VZERO]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
@@ -37,15 +37,14 @@ define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noali
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_add_chain_i32
-; SI: BUFFER_LOAD_DWORD [[VAL0:v[0-9]+]],
-; SI: BUFFER_LOAD_DWORD [[VAL1:v[0-9]+]],
-; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
-; SI: V_BCNT_U32_B32_e32 [[MIDRESULT:v[0-9]+]], [[VAL1]], [[VZERO]]
-; SI-NOT: ADD
-; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
+; SI: buffer_load_dword [[VAL0:v[0-9]+]],
+; SI: buffer_load_dword [[VAL1:v[0-9]+]],
+; SI: v_mov_b32_e32 [[VZERO:v[0-9]+]], 0
+; SI: v_bcnt_u32_b32_e32 [[MIDRESULT:v[0-9]+]], [[VAL1]], [[VZERO]]
+; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -59,10 +58,24 @@ define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_v2i32:
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
+; SI: buffer_load_dword [[VAL0:v[0-9]+]],
+; SI-NEXT: s_waitcnt
+; SI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
+; SI-NEXT: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
+  %val0 = load i32 addrspace(1)* %in0, align 4
+  %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+  %add = add i32 %ctpop0, %sval
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_v2i32:
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -73,12 +86,12 @@ define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrs
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_v4i32:
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_v4i32:
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -91,16 +104,16 @@ define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrs
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_v8i32:
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_v8i32:
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -117,24 +130,24 @@ define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrs
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_v16i32:
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_v16i32:
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -159,11 +172,11 @@ define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> ad
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_i32_add_inline_constant:
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
@@ -174,11 +187,11 @@ define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_i32_add_inline_constant_inv:
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
@@ -189,12 +202,12 @@ define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out,
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_i32_add_literal:
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_MOV_B32_e32 [[LIT:v[0-9]+]], 0x1869f
-; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
@@ -203,12 +216,12 @@ define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspa
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_i32_add_var:
-; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI-DAG: S_LOAD_DWORD [[VAR:s[0-9]+]],
-; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_var:
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; SI-DAG: s_load_dword [[VAR:s[0-9]+]],
+; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
@@ -219,12 +232,12 @@ define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_i32_add_var_inv:
-; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI-DAG: S_LOAD_DWORD [[VAR:s[0-9]+]],
-; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; SI-DAG: s_load_dword [[VAR:s[0-9]+]],
+; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
@@ -235,12 +248,12 @@ define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspa
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_i32_add_vvar_inv
-; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]], {{.*}} + 0x0
-; SI-DAG: BUFFER_LOAD_DWORD [[VAR:v[0-9]+]], {{.*}} + 0x10
-; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}}
+; SI-DAG: buffer_load_dword [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:0x10
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
@@ -256,29 +269,29 @@ define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrsp
 ; FIXME: We currently disallow SALU instructions in all branches,
 ; but there are some cases when the should be allowed.
 
-; FUNC-LABEL: @ctpop_i32_in_br
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
-; EG: BCNT_INT
-define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
+; FUNC-LABEL: {{^}}ctpop_i32_in_br:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
+; SI: s_bcnt1_i32_b32  [[SRESULT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[RESULT]], [[SRESULT]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+; EG: BCNT_INT
+define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
 entry:
-  %0 = icmp eq i32 %cond, 0
-  br i1 %0, label %if, label %else
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %if, label %else
 
 if:
-  %1 = load i32 addrspace(1)* %in
-  %2 = call i32 @llvm.ctpop.i32(i32 %1)
+  %tmp2 = call i32 @llvm.ctpop.i32(i32 %ctpop_arg)
   br label %endif
 
 else:
-  %3 = getelementptr i32 addrspace(1)* %in, i32 1
-  %4 = load i32 addrspace(1)* %3
+  %tmp3 = getelementptr i32 addrspace(1)* %in, i32 1
+  %tmp4 = load i32 addrspace(1)* %tmp3
   br label %endif
 
 endif:
-  %5 = phi i32 [%2, %if], [%4, %else]
-  store i32 %5, i32 addrspace(1)* %out
+  %tmp5 = phi i32 [%tmp2, %if], [%tmp4, %else]
+  store i32 %tmp5, i32 addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/ctpop64.ll b/test/CodeGen/R600/ctpop64.ll
index b36ecc6..2efac8f 100644
--- a/test/CodeGen/R600/ctpop64.ll
+++ b/test/CodeGen/R600/ctpop64.ll
@@ -6,12 +6,12 @@ declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
 declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
 declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
 
-; FUNC-LABEL: @s_ctpop_i64:
-; SI: S_LOAD_DWORDX2 [[SVAL:s\[[0-9]+:[0-9]+\]]],
-; SI: S_BCNT1_I32_B64 [[SRESULT:s[0-9]+]], [[SVAL]]
-; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: BUFFER_STORE_DWORD [[VRESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}s_ctpop_i64:
+; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
 define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %truncctpop = trunc i64 %ctpop to i32
@@ -19,13 +19,13 @@ define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_i64:
-; SI: BUFFER_LOAD_DWORDX2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
-; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
-; SI: V_BCNT_U32_B32_e32 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], [[VZERO]]
-; SI-NEXT: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_i64:
+; SI: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
+; SI: v_mov_b32_e32 [[VZERO:v[0-9]+]], 0
+; SI: v_bcnt_u32_b32_e32 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], [[VZERO]]
+; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %val = load i64 addrspace(1)* %in, align 8
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
@@ -34,10 +34,10 @@ define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noali
   ret void
 }
 
-; FUNC-LABEL: @s_ctpop_v2i64:
-; SI: S_BCNT1_I32_B64
-; SI: S_BCNT1_I32_B64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}s_ctpop_v2i64:
+; SI: s_bcnt1_i32_b64
+; SI: s_bcnt1_i32_b64
+; SI: s_endpgm
 define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
   %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
@@ -45,12 +45,12 @@ define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val)
   ret void
 }
 
-; FUNC-LABEL: @s_ctpop_v4i64:
-; SI: S_BCNT1_I32_B64
-; SI: S_BCNT1_I32_B64
-; SI: S_BCNT1_I32_B64
-; SI: S_BCNT1_I32_B64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}s_ctpop_v4i64:
+; SI: s_bcnt1_i32_b64
+; SI: s_bcnt1_i32_b64
+; SI: s_bcnt1_i32_b64
+; SI: s_bcnt1_i32_b64
+; SI: s_endpgm
 define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
   %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
@@ -58,12 +58,12 @@ define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val)
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_v2i64:
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_v2i64:
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: s_endpgm
 define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
   %val = load <2 x i64> addrspace(1)* %in, align 16
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
@@ -72,16 +72,16 @@ define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrs
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_v4i64:
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_v4i64:
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: s_endpgm
 define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
   %val = load <4 x i64> addrspace(1)* %in, align 32
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
@@ -93,30 +93,29 @@ define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrs
 ; FIXME: We currently disallow SALU instructions in all branches,
 ; but there are some cases when the should be allowed.
 
-; FUNC-LABEL: @ctpop_i64_in_br
-; SI: V_BCNT_U32_B32_e64 [[BCNT_LO:v[0-9]+]], v{{[0-9]+}}, 0
-; SI: V_BCNT_U32_B32_e32 v[[BCNT:[0-9]+]], v{{[0-9]+}}, [[BCNT_LO]]
-; SI: V_MOV_B32_e32 v[[ZERO:[0-9]+]], 0
-; SI: BUFFER_STORE_DWORDX2 v[
-; SI: [[BCNT]]:[[ZERO]]]
-; SI: S_ENDPGM
-define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i32 %cond) {
+; FUNC-LABEL: {{^}}ctpop_i64_in_br:
+; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
+; SI: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
+; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]]
+; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]]
+; SI: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
+; SI: s_endpgm
+define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
 entry:
-  %0 = icmp eq i32 %cond, 0
-  br i1 %0, label %if, label %else
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %if, label %else
 
 if:
-  %1 = load i64 addrspace(1)* %in
-  %2 = call i64 @llvm.ctpop.i64(i64 %1)
+  %tmp2 = call i64 @llvm.ctpop.i64(i64 %ctpop_arg)
   br label %endif
 
 else:
-  %3 = getelementptr i64 addrspace(1)* %in, i32 1
-  %4 = load i64 addrspace(1)* %3
+  %tmp3 = getelementptr i64 addrspace(1)* %in, i32 1
+  %tmp4 = load i64 addrspace(1)* %tmp3
   br label %endif
 
 endif:
-  %5 = phi i64 [%2, %if], [%4, %else]
-  store i64 %5, i64 addrspace(1)* %out
+  %tmp5 = phi i64 [%tmp2, %if], [%tmp4, %else]
+  store i64 %tmp5, i64 addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/cttz_zero_undef.ll b/test/CodeGen/R600/cttz_zero_undef.ll
index cf44f8e..c4b1463 100644
--- a/test/CodeGen/R600/cttz_zero_undef.ll
+++ b/test/CodeGen/R600/cttz_zero_undef.ll
@@ -1,26 +1,31 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
 declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
 declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
 
-; FUNC-LABEL: @s_cttz_zero_undef_i32:
-; SI: S_LOAD_DWORD [[VAL:s[0-9]+]],
-; SI: S_FF1_I32_B32 [[SRESULT:s[0-9]+]], [[VAL]]
-; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: BUFFER_STORE_DWORD [[VRESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32:
+; SI: s_load_dword [[VAL:s[0-9]+]],
+; SI: s_ff1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
 define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %cttz, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @v_cttz_zero_undef_i32:
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_FFBL_B32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
 define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32 addrspace(1)* %valptr, align 4
   %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
@@ -28,12 +33,15 @@ define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace
   ret void
 }
 
-; FUNC-LABEL: @v_cttz_zero_undef_v2i32:
-; SI: BUFFER_LOAD_DWORDX2
-; SI: V_FFBL_B32_e32
-; SI: V_FFBL_B32_e32
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32:
+; SI: buffer_load_dwordx2
+; SI: v_ffbl_b32_e32
+; SI: v_ffbl_b32_e32
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
 define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <2 x i32> addrspace(1)* %valptr, align 8
   %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
@@ -41,14 +49,19 @@ define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x
   ret void
 }
 
-; FUNC-LABEL: @v_cttz_zero_undef_v4i32:
-; SI: BUFFER_LOAD_DWORDX4
-; SI: V_FFBL_B32_e32
-; SI: V_FFBL_B32_e32
-; SI: V_FFBL_B32_e32
-; SI: V_FFBL_B32_e32
-; SI: BUFFER_STORE_DWORDX4
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32:
+; SI: buffer_load_dwordx4
+; SI: v_ffbl_b32_e32
+; SI: v_ffbl_b32_e32
+; SI: v_ffbl_b32_e32
+; SI: v_ffbl_b32_e32
+; SI: buffer_store_dwordx4
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
 define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <4 x i32> addrspace(1)* %valptr, align 16
   %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
diff --git a/test/CodeGen/R600/cvt_f32_ubyte.ll b/test/CodeGen/R600/cvt_f32_ubyte.ll
index fe97a44..0d1db19 100644
--- a/test/CodeGen/R600/cvt_f32_ubyte.ll
+++ b/test/CodeGen/R600/cvt_f32_ubyte.ll
@@ -1,11 +1,11 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @load_i8_to_f32:
-; SI: BUFFER_LOAD_UBYTE [[LOADREG:v[0-9]+]],
-; SI-NOT: BFE
-; SI-NOT: LSHR
-; SI: V_CVT_F32_UBYTE0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
-; SI: BUFFER_STORE_DWORD [[CONV]],
+; SI-LABEL: {{^}}load_i8_to_f32:
+; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]],
+; SI-NOT: bfe
+; SI-NOT: lshr
+; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
+; SI: buffer_store_dword [[CONV]],
 define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
   %load = load i8 addrspace(1)* %in, align 1
   %cvt = uitofp i8 %load to float
@@ -13,14 +13,14 @@ define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* n
   ret void
 }
 
-; SI-LABEL: @load_v2i8_to_v2f32:
-; SI: BUFFER_LOAD_USHORT [[LOADREG:v[0-9]+]],
-; SI-NOT: BFE
-; SI-NOT: LSHR
-; SI-NOT: AND
-; SI-DAG: V_CVT_F32_UBYTE1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
-; SI-DAG: V_CVT_F32_UBYTE0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
-; SI: BUFFER_STORE_DWORDX2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+; SI-LABEL: {{^}}load_v2i8_to_v2f32:
+; SI: buffer_load_ushort [[LOADREG:v[0-9]+]],
+; SI-NOT: bfe
+; SI-NOT: lshr
+; SI-NOT: and
+; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <2 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <2 x i8> %load to <2 x float>
@@ -28,13 +28,13 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8>
   ret void
 }
 
-; SI-LABEL: @load_v3i8_to_v3f32:
-; SI-NOT: BFE
-; SI-NOT: V_CVT_F32_UBYTE3_e32
-; SI-DAG: V_CVT_F32_UBYTE2_e32
-; SI-DAG: V_CVT_F32_UBYTE1_e32
-; SI-DAG: V_CVT_F32_UBYTE0_e32
-; SI: BUFFER_STORE_DWORDX2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+; SI-LABEL: {{^}}load_v3i8_to_v3f32:
+; SI-NOT: bfe
+; SI-NOT: v_cvt_f32_ubyte3_e32
+; SI-DAG: v_cvt_f32_ubyte2_e32
+; SI-DAG: v_cvt_f32_ubyte1_e32
+; SI-DAG: v_cvt_f32_ubyte0_e32
+; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <3 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <3 x i8> %load to <3 x float>
@@ -42,15 +42,19 @@ define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8>
   ret void
 }
 
-; SI-LABEL: @load_v4i8_to_v4f32:
-; SI: BUFFER_LOAD_DWORD [[LOADREG:v[0-9]+]],
-; SI-NOT: BFE
-; SI-NOT: LSHR
-; SI-DAG: V_CVT_F32_UBYTE3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
-; SI-DAG: V_CVT_F32_UBYTE2_e32 v{{[0-9]+}}, [[LOADREG]]
-; SI-DAG: V_CVT_F32_UBYTE1_e32 v{{[0-9]+}}, [[LOADREG]]
-; SI-DAG: V_CVT_F32_UBYTE0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
-; SI: BUFFER_STORE_DWORDX4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+; SI-LABEL: {{^}}load_v4i8_to_v4f32:
+; We can't use buffer_load_dword here, because the load is byte aligned, and
+; buffer_load_dword requires dword alignment.
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: v_or_b32_e32 [[LOADREG:v[0-9]+]]
+; SI-NOT: bfe
+; SI-NOT: lshr
+; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
+; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]]
+; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
+; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <4 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <4 x i8> %load to <4 x float>
@@ -58,27 +62,27 @@ define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8>
   ret void
 }
 
-; XXX - This should really still be able to use the V_CVT_F32_UBYTE0
+; XXX - This should really still be able to use the v_cvt_f32_ubyte0
 ; for each component, but computeKnownBits doesn't handle vectors very
 ; well.
 
-; SI-LABEL: @load_v4i8_to_v4f32_2_uses:
-; SI: BUFFER_LOAD_UBYTE
-; SI: V_CVT_F32_UBYTE0_e32
-; SI: BUFFER_LOAD_UBYTE
-; SI: V_CVT_F32_UBYTE0_e32
-; SI: BUFFER_LOAD_UBYTE
-; SI: V_CVT_F32_UBYTE0_e32
-; SI: BUFFER_LOAD_UBYTE
-; SI: V_CVT_F32_UBYTE0_e32
+; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: v_cvt_f32_ubyte0_e32
+; SI: v_cvt_f32_ubyte0_e32
+; SI: v_cvt_f32_ubyte0_e32
+; SI: v_cvt_f32_ubyte0_e32
 
 ; XXX - replace with this when v4i8 loads aren't scalarized anymore.
-; XSI: BUFFER_LOAD_DWORD
-; XSI: V_CVT_F32_U32_e32
-; XSI: V_CVT_F32_U32_e32
-; XSI: V_CVT_F32_U32_e32
-; XSI: V_CVT_F32_U32_e32
-; SI: S_ENDPGM
+; XSI: buffer_load_dword
+; XSI: v_cvt_f32_u32_e32
+; XSI: v_cvt_f32_u32_e32
+; XSI: v_cvt_f32_u32_e32
+; XSI: v_cvt_f32_u32_e32
+; SI: s_endpgm
 define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <4 x i8> addrspace(1)* %in, align 4
   %cvt = uitofp <4 x i8> %load to <4 x float>
@@ -89,8 +93,8 @@ define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <
 }
 
 ; Make sure this doesn't crash.
-; SI-LABEL: @load_v7i8_to_v7f32:
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}load_v7i8_to_v7f32:
+; SI: s_endpgm
 define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <7 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <7 x i8> %load to <7 x float>
@@ -98,28 +102,28 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8>
   ret void
 }
 
-; SI-LABEL: @load_v8i8_to_v8f32:
-; SI: BUFFER_LOAD_DWORDX2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
-; SI-NOT: BFE
-; SI-NOT: LSHR
-; SI-DAG: V_CVT_F32_UBYTE3_e32 v{{[0-9]+}}, v[[LOLOAD]]
-; SI-DAG: V_CVT_F32_UBYTE2_e32 v{{[0-9]+}}, v[[LOLOAD]]
-; SI-DAG: V_CVT_F32_UBYTE1_e32 v{{[0-9]+}}, v[[LOLOAD]]
-; SI-DAG: V_CVT_F32_UBYTE0_e32 v{{[0-9]+}}, v[[LOLOAD]]
-; SI-DAG: V_CVT_F32_UBYTE3_e32 v{{[0-9]+}}, v[[HILOAD]]
-; SI-DAG: V_CVT_F32_UBYTE2_e32 v{{[0-9]+}}, v[[HILOAD]]
-; SI-DAG: V_CVT_F32_UBYTE1_e32 v{{[0-9]+}}, v[[HILOAD]]
-; SI-DAG: V_CVT_F32_UBYTE0_e32 v{{[0-9]+}}, v[[HILOAD]]
-; SI-NOT: BFE
-; SI-NOT: LSHR
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
+; SI-LABEL: {{^}}load_v8i8_to_v8f32:
+; SI: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
+; SI-NOT: bfe
+; SI-NOT: lshr
+; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
+; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]]
+; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]]
+; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]]
+; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]]
+; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]]
+; SI-NOT: bfe
+; SI-NOT: lshr
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
 define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <8 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <8 x i8> %load to <8 x float>
@@ -127,11 +131,11 @@ define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8>
   ret void
 }
 
-; SI-LABEL: @i8_zext_inreg_i32_to_f32:
-; SI: BUFFER_LOAD_DWORD [[LOADREG:v[0-9]+]],
-; SI: V_ADD_I32_e32 [[ADD:v[0-9]+]], 2, [[LOADREG]]
-; SI-NEXT: V_CVT_F32_UBYTE0_e32 [[CONV:v[0-9]+]], [[ADD]]
-; SI: BUFFER_STORE_DWORD [[CONV]],
+; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
+; SI: buffer_load_dword [[LOADREG:v[0-9]+]],
+; SI: v_add_i32_e32 [[ADD:v[0-9]+]], 2, [[LOADREG]]
+; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
+; SI: buffer_store_dword [[CONV]],
 define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %load = load i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 2
@@ -141,7 +145,7 @@ define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addr
   ret void
 }
 
-; SI-LABEL: @i8_zext_inreg_hi1_to_f32:
+; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
 define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %load = load i32 addrspace(1)* %in, align 4
   %inreg = and i32 %load, 65280
diff --git a/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll
index 6607c12..1e47bfa 100644
--- a/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll
+++ b/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll
@@ -7,7 +7,7 @@
 ; ISD::UINT_TO_FP and ISD::SINT_TO_FP opcodes.
 
 
-; CHECK: @sint
+; CHECK: {{^}}sint:
 ; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -21,7 +21,7 @@ entry:
   ret void
 }
 
-;CHECK: @uint
+;CHECK: {{^}}uint:
 ;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
diff --git a/test/CodeGen/R600/default-fp-mode.ll b/test/CodeGen/R600/default-fp-mode.ll
index 214b2c2..935bf97 100644
--- a/test/CodeGen/R600/default-fp-mode.ll
+++ b/test/CodeGen/R600/default-fp-mode.ll
@@ -1,8 +1,27 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
 
-; SI-LABEL: @test_kernel
-; SI: FloatMode: 240
-; SI: IeeeMode: 0
+; FUNC-LABEL: {{^}}test_kernel:
+
+; DEFAULT: FloatMode: 192
+; DEFAULT: IeeeMode: 0
+
+; FP64-DENORMAL: FloatMode: 192
+; FP64-DENORMAL: IeeeMode: 0
+
+; FP32-DENORMAL: FloatMode: 48
+; FP32-DENORMAL: IeeeMode: 0
+
+; BOTH-DENORMAL: FloatMode: 240
+; BOTH-DENORMAL: IeeeMode: 0
+
+; NO-DENORMAL: FloatMode: 0
+; NO-DENORMAL: IeeeMode: 0
 define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
diff --git a/test/CodeGen/R600/disconnected-predset-break-bug.ll b/test/CodeGen/R600/disconnected-predset-break-bug.ll
index 012c17b..858e4b9 100644
--- a/test/CodeGen/R600/disconnected-predset-break-bug.ll
+++ b/test/CodeGen/R600/disconnected-predset-break-bug.ll
@@ -4,7 +4,7 @@
 ; result.  This tests that there are no instructions between the PRED_SET*
 ; and the PREDICATE_BREAK in this loop.
 
-; CHECK: @loop_ge
+; CHECK: {{^}}loop_ge:
 ; CHECK: LOOP_START_DX10
 ; CHECK: ALU_PUSH_BEFORE
 ; CHECK-NEXT: JUMP
diff --git a/test/CodeGen/R600/dot4-folding.ll b/test/CodeGen/R600/dot4-folding.ll
index 3e8330f..dca6a59 100644
--- a/test/CodeGen/R600/dot4-folding.ll
+++ b/test/CodeGen/R600/dot4-folding.ll
@@ -2,7 +2,7 @@
 
 ; Exactly one constant vector can be folded into dot4, which means exactly
 ; 4 MOV instructions
-; CHECK: @main
+; CHECK: {{^}}main:
 ; CHECK: MOV
 ; CHECK: MOV
 ; CHECK: MOV
diff --git a/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll
new file mode 100644
index 0000000..f334062
--- /dev/null
+++ b/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll
@@ -0,0 +1,69 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare void @llvm.AMDGPU.barrier.local() #1
+
+; Function Attrs: nounwind
+; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop:
+; CHECK: BB0_1:
+; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]],
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]]
+; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], 4, [[VADDR]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR4]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], 0x80, [[VADDR]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x84:v[0-9]+]], 0x84, [[VADDR]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x84]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], 0x100, [[VADDR]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]]
+
+; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:0 offset1:1
+; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:33
+; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256
+; CHECK: s_endpgm
+define void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 {
+entry:
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #0
+  %mul = shl nsw i32 %x.i, 1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %sum.03 = phi float [ 0.000000e+00, %entry ], [ %add13, %for.body ]
+  %offset.02 = phi i32 [ %mul, %entry ], [ %add14, %for.body ]
+  %k.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  tail call void @llvm.AMDGPU.barrier.local() #1
+  %arrayidx = getelementptr inbounds float addrspace(3)* %lptr, i32 %offset.02
+  %tmp = load float addrspace(3)* %arrayidx, align 4
+  %add1 = add nsw i32 %offset.02, 1
+  %arrayidx2 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add1
+  %tmp1 = load float addrspace(3)* %arrayidx2, align 4
+  %add3 = add nsw i32 %offset.02, 32
+  %arrayidx4 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add3
+  %tmp2 = load float addrspace(3)* %arrayidx4, align 4
+  %add5 = add nsw i32 %offset.02, 33
+  %arrayidx6 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add5
+  %tmp3 = load float addrspace(3)* %arrayidx6, align 4
+  %add7 = add nsw i32 %offset.02, 64
+  %arrayidx8 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add7
+  %tmp4 = load float addrspace(3)* %arrayidx8, align 4
+  %add9 = fadd float %tmp, %tmp1
+  %add10 = fadd float %add9, %tmp2
+  %add11 = fadd float %add10, %tmp3
+  %add12 = fadd float %add11, %tmp4
+  %add13 = fadd float %sum.03, %add12
+  %inc = add nsw i32 %k.01, 1
+  %add14 = add nsw i32 %offset.02, 97
+  %exitcond = icmp eq i32 %inc, 8
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %tmp5 = sext i32 %x.i to i64
+  %arrayidx15 = getelementptr inbounds float addrspace(1)* %out, i64 %tmp5
+  store float %add13, float addrspace(1)* %arrayidx15, align 4
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { noduplicate nounwind }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/R600/ds_read2.ll b/test/CodeGen/R600/ds_read2.ll
new file mode 100644
index 0000000..6e0c8be
--- /dev/null
+++ b/test/CodeGen/R600/ds_read2.ll
@@ -0,0 +1,515 @@
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+
+; FIXME: We don't get cases where the address was an SGPR because we
+; get a copy to the address register for each one.
+
+@lds = addrspace(3) global [512 x float] undef, align 4
+ @lds.f64 = addrspace(3) global [512 x double] undef, align 8
+
+; SI-LABEL: @simple_read2_f32
+; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:0 offset1:8
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2_f32(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2_f32_max_offset
+; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:0 offset1:255
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 255
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2_f32_too_far
+; SI-NOT ds_read2_b32
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
+; SI: s_endpgm
+define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 257
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2_f32_x2
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:0 offset1:8
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
+; SI: s_endpgm
+define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 0
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+
+  %idx.1 = add nsw i32 %tid.x, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum.0 = fadd float %val0, %val1
+
+  %idx.2 = add nsw i32 %tid.x, 11
+  %arrayidx2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+  %val2 = load float addrspace(3)* %arrayidx2, align 4
+
+  %idx.3 = add nsw i32 %tid.x, 27
+  %arrayidx3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+  %val3 = load float addrspace(3)* %arrayidx3, align 4
+  %sum.1 = fadd float %val2, %val3
+
+  %sum = fadd float %sum.0, %sum.1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %idx.0
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; Make sure there is an instruction between the two sets of reads.
+; SI-LABEL: @simple_read2_f32_x2_barrier
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:0 offset1:8
+; SI: s_barrier
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
+; SI: s_endpgm
+define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 0
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+
+  %idx.1 = add nsw i32 %tid.x, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum.0 = fadd float %val0, %val1
+
+  call void @llvm.AMDGPU.barrier.local() #2
+
+  %idx.2 = add nsw i32 %tid.x, 11
+  %arrayidx2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+  %val2 = load float addrspace(3)* %arrayidx2, align 4
+
+  %idx.3 = add nsw i32 %tid.x, 27
+  %arrayidx3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+  %val3 = load float addrspace(3)* %arrayidx3, align 4
+  %sum.1 = fadd float %val2, %val3
+
+  %sum = fadd float %sum.0, %sum.1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %idx.0
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; For some reason adding something to the base address for the first
+; element results in only folding the inner pair.
+
+; SI-LABEL: @simple_read2_f32_x2_nonzero_base
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
+; SI: s_endpgm
+define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+
+  %idx.1 = add nsw i32 %tid.x, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum.0 = fadd float %val0, %val1
+
+  %idx.2 = add nsw i32 %tid.x, 11
+  %arrayidx2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+  %val2 = load float addrspace(3)* %arrayidx2, align 4
+
+  %idx.3 = add nsw i32 %tid.x, 27
+  %arrayidx3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+  %val3 = load float addrspace(3)* %arrayidx3, align 4
+  %sum.1 = fadd float %val2, %val3
+
+  %sum = fadd float %sum.0, %sum.1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %idx.0
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; Be careful of vectors of pointers. We don't know if the 2 pointers
+; in the vectors are really the same base, so this is not safe to
+; merge.
+; Base pointers come from different subregister of same super
+; register. We can't safely merge this.
+
+; SI-LABEL: @read2_ptr_is_subreg_arg_f32
+; SI-NOT: ds_read2_b32
+; SI: ds_read_b32
+; SI: ds_read_b32
+; SI: s_endpgm
+define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
+  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
+  %gep = getelementptr inbounds <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
+  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
+  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
+  %val0 = load float addrspace(3)* %gep.0, align 4
+  %val1 = load float addrspace(3)* %gep.1, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; Apply a constant scalar offset after the pointer vector extract.  We
+; are rejecting merges that have the same, constant 0 offset, so make
+; sure we are really rejecting it because of the different
+; subregisters.
+
+; SI-LABEL: @read2_ptr_is_subreg_arg_offset_f32
+; SI-NOT: ds_read2_b32
+; SI: ds_read_b32
+; SI: ds_read_b32
+; SI: s_endpgm
+define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
+  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
+  %gep = getelementptr inbounds <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
+  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
+  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
+
+  ; Apply an additional offset after the vector that will be more obviously folded.
+  %gep.1.offset = getelementptr float addrspace(3)* %gep.1, i32 8
+
+  %val0 = load float addrspace(3)* %gep.0, align 4
+  %val1 = load float addrspace(3)* %gep.1.offset, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; We should be able to merge in this case, but probably not worth the effort.
+; SI-NOT: ds_read2_b32
+; SI: ds_read_b32
+; SI: ds_read_b32
+; SI: s_endpgm
+define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
+  %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1
+  %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
+  %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1
+  %idx = add <2 x i32> %x.i.v.1, <i32 0, i32 8>
+  %gep = getelementptr inbounds <2 x [512 x float] addrspace(3)*> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx
+  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
+  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
+  %val0 = load float addrspace(3)* %gep.0, align 4
+  %val1 = load float addrspace(3)* %gep.1, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2_f32_volatile_0
+; SI-NOT ds_read2_b32
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
+; SI: s_endpgm
+define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load volatile float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2_f32_volatile_1
+; SI-NOT ds_read2_b32
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
+; SI: s_endpgm
+define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load volatile float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; Can't fold since not correctly aligned.
+; XXX: This isn't really testing anything useful now. I think CI
+; allows unaligned LDS accesses, which would be a problem here.
+; SI-LABEL: @unaligned_read2_f32
+; SI-NOT: ds_read2_b32
+; SI: s_endpgm
+define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %x.i
+  %val0 = load float addrspace(3)* %arrayidx0, align 1
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x
+  %val1 = load float addrspace(3)* %arrayidx1, align 1
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @misaligned_2_simple_read2_f32
+; SI-NOT: ds_read2_b32
+; SI: s_endpgm
+define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %x.i
+  %val0 = load float addrspace(3)* %arrayidx0, align 2
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x
+  %val1 = load float addrspace(3)* %arrayidx1, align 2
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2_f64
+; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}
+; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset0:0 offset1:8
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2_f64(double addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; SI-LABEL: @simple_read2_f64_max_offset
+; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:0 offset1:255
+; SI: s_endpgm
+define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x = add nsw i32 %x.i, 255
+  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; SI-LABEL: @simple_read2_f64_too_far
+; SI-NOT ds_read2_b64
+; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
+; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
+; SI: s_endpgm
+define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x = add nsw i32 %x.i, 257
+  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; Alignment only 4
+; SI-LABEL: @misaligned_read2_f64
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:0 offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
+; SI: s_endpgm
+define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %x.i
+  %val0 = load double addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 7
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x
+  %val1 = load double addrspace(3)* %arrayidx1, align 4
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+@foo = addrspace(3) global [4 x i32] undef, align 4
+
+; SI-LABEL: @load_constant_adjacent_offsets
+; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:0 offset1:1
+define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
+  %val0 = load i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
+  %val1 = load i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
+  %sum = add i32 %val0, %val1
+  store i32 %sum, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @load_constant_disjoint_offsets
+; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:0 offset1:2
+define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
+  %val0 = load i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
+  %val1 = load i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
+  %sum = add i32 %val0, %val1
+  store i32 %sum, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+@bar = addrspace(3) global [4 x i64] undef, align 4
+
+; SI-LABEL: @load_misaligned64_constant_offsets
+; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:0 offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
+define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
+  %val0 = load i64 addrspace(3)* getelementptr inbounds ([4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
+  %val1 = load i64 addrspace(3)* getelementptr inbounds ([4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
+  %sum = add i64 %val0, %val1
+  store i64 %sum, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+@bar.large = addrspace(3) global [4096 x i64] undef, align 4
+
+; SI-LABEL: @load_misaligned64_constant_large_offsets
+; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
+; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
+; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset0:0 offset1:1
+; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset0:0 offset1:1
+; SI: s_endpgm
+define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
+  %val0 = load i64 addrspace(3)* getelementptr inbounds ([4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
+  %val1 = load i64 addrspace(3)* getelementptr inbounds ([4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
+  %sum = add i64 %val0, %val1
+  store i64 %sum, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
+@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
+
+define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tgid.x() #1
+  %y.i = tail call i32 @llvm.r600.read.tidig.y() #1
+  %arrayidx44 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
+  %tmp16 = load float addrspace(3)* %arrayidx44, align 4
+  %add47 = add nsw i32 %x.i, 1
+  %arrayidx48 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
+  %tmp17 = load float addrspace(3)* %arrayidx48, align 4
+  %add51 = add nsw i32 %x.i, 16
+  %arrayidx52 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
+  %tmp18 = load float addrspace(3)* %arrayidx52, align 4
+  %add55 = add nsw i32 %x.i, 17
+  %arrayidx56 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
+  %tmp19 = load float addrspace(3)* %arrayidx56, align 4
+  %arrayidx60 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
+  %tmp20 = load float addrspace(3)* %arrayidx60, align 4
+  %add63 = add nsw i32 %y.i, 1
+  %arrayidx64 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
+  %tmp21 = load float addrspace(3)* %arrayidx64, align 4
+  %add67 = add nsw i32 %y.i, 32
+  %arrayidx68 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
+  %tmp22 = load float addrspace(3)* %arrayidx68, align 4
+  %add71 = add nsw i32 %y.i, 33
+  %arrayidx72 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
+  %tmp23 = load float addrspace(3)* %arrayidx72, align 4
+  %add75 = add nsw i32 %y.i, 64
+  %arrayidx76 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
+  %tmp24 = load float addrspace(3)* %arrayidx76, align 4
+  %add79 = add nsw i32 %y.i, 65
+  %arrayidx80 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
+  %tmp25 = load float addrspace(3)* %arrayidx80, align 4
+  %sum.0 = fadd float %tmp16, %tmp17
+  %sum.1 = fadd float %sum.0, %tmp18
+  %sum.2 = fadd float %sum.1, %tmp19
+  %sum.3 = fadd float %sum.2, %tmp20
+  %sum.4 = fadd float %sum.3, %tmp21
+  %sum.5 = fadd float %sum.4, %tmp22
+  %sum.6 = fadd float %sum.5, %tmp23
+  %sum.7 = fadd float %sum.6, %tmp24
+  %sum.8 = fadd float %sum.7, %tmp25
+  store float %sum.8, float addrspace(1)* %C, align 4
+  ret void
+}
+
+define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
+  %load = load <2 x i32> addrspace(3)* %in, align 4
+  store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
+  %load = load i64 addrspace(3)* %in, align 4
+  store i64 %load, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noduplicate nounwind }
diff --git a/test/CodeGen/R600/ds_read2st64.ll b/test/CodeGen/R600/ds_read2st64.ll
new file mode 100644
index 0000000..3e98e59
--- /dev/null
+++ b/test/CodeGen/R600/ds_read2st64.ll
@@ -0,0 +1,272 @@
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+
+@lds = addrspace(3) global [512 x float] undef, align 4
+@lds.f64 = addrspace(3) global [512 x double] undef, align 8
+
+
+; SI-LABEL: @simple_read2st64_f32_0_1
+; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 64
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2st64_f32_1_2
+; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.x.0 = add nsw i32 %x.i, 64
+  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x.1 = add nsw i32 %x.i, 128
+  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2st64_f32_max_offset
+; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.x.0 = add nsw i32 %x.i, 64
+  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x.1 = add nsw i32 %x.i, 16320
+  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2st64_f32_over_max_offset
+; SI-NOT: ds_read2st64_b32
+; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
+; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
+; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
+; SI: s_endpgm
+define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.x.0 = add nsw i32 %x.i, 64
+  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x.1 = add nsw i32 %x.i, 16384
+  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @odd_invalid_read2st64_f32_0
+; SI-NOT: ds_read2st64_b32
+; SI: s_endpgm
+define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 63
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @odd_invalid_read2st64_f32_1
+; SI-NOT: ds_read2st64_b32
+; SI: s_endpgm
+define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.x.0 = add nsw i32 %x.i, 64
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x.1 = add nsw i32 %x.i, 127
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2st64_f64_0_1
+; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x = add nsw i32 %x.i, 64
+  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; SI-LABEL: @simple_read2st64_f64_1_2
+; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.x.0 = add nsw i32 %x.i, 64
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x.1 = add nsw i32 %x.i, 128
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; Alignment only
+
+; SI-LABEL: @misaligned_read2st64_f64
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:0 offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
+; SI: s_endpgm
+define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %x.i
+  %val0 = load double addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 64
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x
+  %val1 = load double addrspace(3)* %arrayidx1, align 4
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff
+; SI-LABEL: @simple_read2st64_f64_max_offset
+; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.x.0 = add nsw i32 %x.i, 256
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x.1 = add nsw i32 %x.i, 8128
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; SI-LABEL: @simple_read2st64_f64_over_max_offset
+; SI-NOT: ds_read2st64_b64
+; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
+; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
+; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
+; SI: s_endpgm
+define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.x.0 = add nsw i32 %x.i, 64
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x.1 = add nsw i32 %x.i, 8192
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; SI-LABEL: @invalid_read2st64_f64_odd_offset
+; SI-NOT: ds_read2st64_b64
+; SI: s_endpgm
+define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.x.0 = add nsw i32 %x.i, 64
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x.1 = add nsw i32 %x.i, 8129
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; The stride of 8 elements is 8 * 8 bytes. We need to make sure the
+; stride in elements, not bytes, is a multiple of 64.
+
+; SI-LABEL: @byte_size_only_divisible_64_read2_f64
+; SI-NOT: ds_read2st_b64
+; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:0 offset1:8
+; SI: s_endpgm
+define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %x.i
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noduplicate nounwind }
diff --git a/test/CodeGen/R600/ds_write2.ll b/test/CodeGen/R600/ds_write2.ll
new file mode 100644
index 0000000..1807fb5
--- /dev/null
+++ b/test/CodeGen/R600/ds_write2.ll
@@ -0,0 +1,425 @@
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+
+@lds = addrspace(3) global [512 x float] undef, align 4
+@lds.f64 = addrspace(3) global [512 x double] undef, align 8
+
+
+; SI-LABEL: @simple_write2_one_val_f32
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:8 [M0]
+; SI: s_endpgm
+define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep = getelementptr float addrspace(1)* %in, i32 %x.i
+  %val = load float addrspace(1)* %in.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f32
+; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:8 [M0]
+; SI: s_endpgm
+define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep.0 = getelementptr float addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
+  %val0 = load float addrspace(1)* %in.gep.0, align 4
+  %val1 = load float addrspace(1)* %in.gep.1, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f32_volatile_0
+; SI-NOT: ds_write2_b32
+; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
+; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
+; SI: s_endpgm
+define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %x.i
+  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %x.i
+  %val0 = load float addrspace(1)* %in0.gep, align 4
+  %val1 = load float addrspace(1)* %in1.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f32_volatile_1
+; SI-NOT: ds_write2_b32
+; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
+; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
+; SI: s_endpgm
+define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %x.i
+  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %x.i
+  %val0 = load float addrspace(1)* %in0.gep, align 4
+  %val1 = load float addrspace(1)* %in1.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store volatile float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; 2 data subregisters from different super registers.
+; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32
+; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
+; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
+; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8 [M0]
+; SI: s_endpgm
+define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep.0 = getelementptr <2 x float> addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr <2 x float> addrspace(1)* %in.gep.0, i32 1
+  %val0 = load <2 x float> addrspace(1)* %in.gep.0, align 8
+  %val1 = load <2 x float> addrspace(1)* %in.gep.1, align 8
+  %val0.0 = extractelement <2 x float> %val0, i32 0
+  %val1.1 = extractelement <2 x float> %val1, i32 1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val0.0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val1.1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_subreg2_f32
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8 [M0]
+; SI: s_endpgm
+define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep = getelementptr <2 x float> addrspace(1)* %in, i32 %x.i
+  %val = load <2 x float> addrspace(1)* %in.gep, align 8
+  %val0 = extractelement <2 x float> %val, i32 0
+  %val1 = extractelement <2 x float> %val, i32 1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_subreg4_f32
+; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8 [M0]
+; SI: s_endpgm
+define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep = getelementptr <4 x float> addrspace(1)* %in, i32 %x.i
+  %val = load <4 x float> addrspace(1)* %in.gep, align 16
+  %val0 = extractelement <4 x float> %val, i32 0
+  %val1 = extractelement <4 x float> %val, i32 3
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_max_offset_f32
+; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:255 [M0]
+; SI: s_endpgm
+define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep.0 = getelementptr float addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
+  %val0 = load float addrspace(1)* %in.gep.0, align 4
+  %val1 = load float addrspace(1)* %in.gep.1, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 255
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_too_far_f32
+; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
+; SI: s_endpgm
+define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %x.i
+  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %x.i
+  %val0 = load float addrspace(1)* %in0.gep, align 4
+  %val1 = load float addrspace(1)* %in1.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 257
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f32_x2
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:0 offset1:8
+; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
+; SI: s_endpgm
+define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %tid.x
+  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %tid.x
+  %val0 = load float addrspace(1)* %in0.gep, align 4
+  %val1 = load float addrspace(1)* %in1.gep, align 4
+
+  %idx.0 = add nsw i32 %tid.x, 0
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+
+  %idx.1 = add nsw i32 %tid.x, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+
+  %idx.2 = add nsw i32 %tid.x, 11
+  %arrayidx2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+  store float %val0, float addrspace(3)* %arrayidx2, align 4
+
+  %idx.3 = add nsw i32 %tid.x, 27
+  %arrayidx3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+  store float %val1, float addrspace(3)* %arrayidx3, align 4
+
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
+; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
+; SI: s_endpgm
+define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %tid.x
+  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %tid.x
+  %val0 = load float addrspace(1)* %in0.gep, align 4
+  %val1 = load float addrspace(1)* %in1.gep, align 4
+
+  %idx.0 = add nsw i32 %tid.x, 3
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+
+  %idx.1 = add nsw i32 %tid.x, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+
+  %idx.2 = add nsw i32 %tid.x, 11
+  %arrayidx2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+  store float %val0, float addrspace(3)* %arrayidx2, align 4
+
+  %idx.3 = add nsw i32 %tid.x, 27
+  %arrayidx3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+  store float %val1, float addrspace(3)* %arrayidx3, align 4
+
+  ret void
+}
+
+; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32
+; SI-NOT: ds_write2_b32
+; SI: ds_write_b32
+; SI: ds_write_b32
+; SI: s_endpgm
+define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %x.i
+  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %x.i
+  %val0 = load float addrspace(1)* %in0.gep, align 4
+  %val1 = load float addrspace(1)* %in1.gep, align 4
+
+  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
+  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
+  %gep = getelementptr inbounds <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
+  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
+  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
+
+  ; Apply an additional offset after the vector that will be more obviously folded.
+  %gep.1.offset = getelementptr float addrspace(3)* %gep.1, i32 8
+  store float %val0, float addrspace(3)* %gep.0, align 4
+
+  %add.x = add nsw i32 %x.i, 8
+  store float %val1, float addrspace(3)* %gep.1.offset, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_one_val_f64
+; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
+; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
+; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:8 [M0]
+; SI: s_endpgm
+define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep = getelementptr double addrspace(1)* %in, i32 %x.i
+  %val = load double addrspace(1)* %in.gep, align 8
+  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  store double %val, double addrspace(3)* %arrayidx0, align 8
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  store double %val, double addrspace(3)* %arrayidx1, align 8
+  ret void
+}
+
+; SI-LABEL: @misaligned_simple_write2_one_val_f64
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:1 [M0]
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 [M0]
+; SI: s_endpgm
+define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep = getelementptr double addrspace(1)* %in, i32 %x.i
+  %val = load double addrspace(1)* %in.gep, align 8
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %x.i
+  store double %val, double addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 7
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x
+  store double %val, double addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f64
+; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x8
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
+; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:8 [M0]
+; SI: s_endpgm
+define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep.0 = getelementptr double addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr double addrspace(1)* %in.gep.0, i32 1
+  %val0 = load double addrspace(1)* %in.gep.0, align 8
+  %val1 = load double addrspace(1)* %in.gep.1, align 8
+  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  store double %val0, double addrspace(3)* %arrayidx0, align 8
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  store double %val1, double addrspace(3)* %arrayidx1, align 8
+  ret void
+}
+
+@foo = addrspace(3) global [4 x i32] undef, align 4
+
+; SI-LABEL: @store_constant_adjacent_offsets
+; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+define void @store_constant_adjacent_offsets() {
+  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
+  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
+  ret void
+}
+
+; SI-LABEL: @store_constant_disjoint_offsets
+; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
+; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset0:0 offset1:2
+define void @store_constant_disjoint_offsets() {
+  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
+  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
+  ret void
+}
+
+@bar = addrspace(3) global [4 x i64] undef, align 4
+
+; SI-LABEL: @store_misaligned64_constant_offsets
+; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+define void @store_misaligned64_constant_offsets() {
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
+  ret void
+}
+
+@bar.large = addrspace(3) global [4096 x i64] undef, align 4
+
+; SI-LABEL: @store_misaligned64_constant_large_offsets
+; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
+; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
+; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: s_endpgm
+define void @store_misaligned64_constant_large_offsets() {
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
+  ret void
+}
+
+@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
+@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
+
+define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tgid.x() #1
+  %y.i = tail call i32 @llvm.r600.read.tidig.y() #1
+  %val = load float addrspace(1)* %in
+  %arrayidx44 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
+  store float %val, float addrspace(3)* %arrayidx44, align 4
+  %add47 = add nsw i32 %x.i, 1
+  %arrayidx48 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
+  store float %val, float addrspace(3)* %arrayidx48, align 4
+  %add51 = add nsw i32 %x.i, 16
+  %arrayidx52 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
+  store float %val, float addrspace(3)* %arrayidx52, align 4
+  %add55 = add nsw i32 %x.i, 17
+  %arrayidx56 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
+  store float %val, float addrspace(3)* %arrayidx56, align 4
+  %arrayidx60 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
+  store float %val, float addrspace(3)* %arrayidx60, align 4
+  %add63 = add nsw i32 %y.i, 1
+  %arrayidx64 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
+  store float %val, float addrspace(3)* %arrayidx64, align 4
+  %add67 = add nsw i32 %y.i, 32
+  %arrayidx68 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
+  store float %val, float addrspace(3)* %arrayidx68, align 4
+  %add71 = add nsw i32 %y.i, 33
+  %arrayidx72 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
+  store float %val, float addrspace(3)* %arrayidx72, align 4
+  %add75 = add nsw i32 %y.i, 64
+  %arrayidx76 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
+  store float %val, float addrspace(3)* %arrayidx76, align 4
+  %add79 = add nsw i32 %y.i, 65
+  %arrayidx80 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
+  store float %val, float addrspace(3)* %arrayidx80, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noduplicate nounwind }
diff --git a/test/CodeGen/R600/ds_write2st64.ll b/test/CodeGen/R600/ds_write2st64.ll
new file mode 100644
index 0000000..4cafb7c
--- /dev/null
+++ b/test/CodeGen/R600/ds_write2st64.ll
@@ -0,0 +1,119 @@
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+
+
+@lds = addrspace(3) global [512 x float] undef, align 4
+
+
+; SI-LABEL: @simple_write2st64_one_val_f32_0_1
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:1 [M0]
+; SI: s_endpgm
+define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep = getelementptr float addrspace(1)* %in, i32 %x.i
+  %val = load float addrspace(1)* %in.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 64
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2st64_two_val_f32_2_5
+; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5 [M0]
+; SI: s_endpgm
+define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep.0 = getelementptr float addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
+  %val0 = load float addrspace(1)* %in.gep.0, align 4
+  %val1 = load float addrspace(1)* %in.gep.1, align 4
+  %add.x.0 = add nsw i32 %x.i, 128
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x.1 = add nsw i32 %x.i, 320
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2st64_two_val_max_offset_f32
+; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:255 [M0]
+; SI: s_endpgm
+define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep.0 = getelementptr float addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
+  %val0 = load float addrspace(1)* %in.gep.0, align 4
+  %val1 = load float addrspace(1)* %in.gep.1, align 4
+  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %x.i
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 16320
+  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2st64_two_val_max_offset_f64
+; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x8
+; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]],
+; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127 [M0]
+; SI: s_endpgm
+define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep.0 = getelementptr double addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr double addrspace(1)* %in.gep.0, i32 1
+  %val0 = load double addrspace(1)* %in.gep.0, align 8
+  %val1 = load double addrspace(1)* %in.gep.1, align 8
+  %add.x.0 = add nsw i32 %x.i, 256
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.0
+  store double %val0, double addrspace(3)* %arrayidx0, align 8
+  %add.x.1 = add nsw i32 %x.i, 8128
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.1
+  store double %val1, double addrspace(3)* %arrayidx1, align 8
+  ret void
+}
+
+; SI-LABEL: @byte_size_only_divisible_64_write2st64_f64
+; SI-NOT: ds_write2st64_b64
+; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:0 offset1:8
+; SI: s_endpgm
+define void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep = getelementptr double addrspace(1)* %in, i32 %x.i
+  %val = load double addrspace(1)* %in.gep, align 8
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %x.i
+  store double %val, double addrspace(3)* %arrayidx0, align 8
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x
+  store double %val, double addrspace(3)* %arrayidx1, align 8
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noduplicate nounwind }
diff --git a/test/CodeGen/R600/elf.ll b/test/CodeGen/R600/elf.ll
index 9385150..6c521d0 100644
--- a/test/CodeGen/R600/elf.ll
+++ b/test/CodeGen/R600/elf.ll
@@ -5,6 +5,8 @@
 ; ELF-CHECK: Name: .AMDGPU.config
 ; ELF-CHECK: Type: SHT_PROGBITS
 
+; CONFIG-CHECK: .align 256
+; CONFIG-CHECK: test:
 ; CONFIG-CHECK: .section .AMDGPU.config
 ; CONFIG-CHECK-NEXT: .long   45096
 ; CONFIG-CHECK-NEXT: .long   0
diff --git a/test/CodeGen/R600/empty-function.ll b/test/CodeGen/R600/empty-function.ll
new file mode 100644
index 0000000..d4ff803
--- /dev/null
+++ b/test/CodeGen/R600/empty-function.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; Make sure we don't assert on empty functions
+
+; SI-LABEL: {{^}}empty_function_ret:
+; SI: .text
+; SI: s_endpgm
+; SI: codeLenInByte = 4
+define void @empty_function_ret() #0 {
+  ret void
+}
+
+; SI-LABEL: {{^}}empty_function_unreachable:
+; SI: .text
+; SI: codeLenInByte = 0
+define void @empty_function_unreachable() #0 {
+  unreachable
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/R600/extload.ll b/test/CodeGen/R600/extload.ll
index dc056e0..5bda8f8 100644
--- a/test/CodeGen/R600/extload.ll
+++ b/test/CodeGen/R600/extload.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @anyext_load_i8:
+; FUNC-LABEL: {{^}}anyext_load_i8:
 ; EG: AND_INT
 ; EG: 255
 define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
@@ -13,7 +13,7 @@ define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspac
   ret void
 }
 
-; FUNC-LABEL: @anyext_load_i16:
+; FUNC-LABEL: {{^}}anyext_load_i16:
 ; EG: AND_INT
 ; EG: AND_INT
 ; EG-DAG: 65535
@@ -27,7 +27,7 @@ define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrs
   ret void
 }
 
-; FUNC-LABEL: @anyext_load_lds_i8:
+; FUNC-LABEL: {{^}}anyext_load_lds_i8:
 ; EG: AND_INT
 ; EG: 255
 define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
@@ -39,7 +39,7 @@ define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addr
   ret void
 }
 
-; FUNC-LABEL: @anyext_load_lds_i16:
+; FUNC-LABEL: {{^}}anyext_load_lds_i16:
 ; EG: AND_INT
 ; EG: AND_INT
 ; EG-DAG: 65535
@@ -53,10 +53,10 @@ define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 a
   ret void
 }
 
-; FUNC-LABEL: @sextload_global_i8_to_i64
-; SI: BUFFER_LOAD_SBYTE [[LOAD:v[0-9]+]],
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 31, [[LOAD]]
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sextload_global_i8_to_i64:
+; SI: buffer_load_sbyte [[LOAD:v[0-9]+]],
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
+; SI: buffer_store_dwordx2
 define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
   %a = load i8 addrspace(1)* %in, align 8
   %ext = sext i8 %a to i64
@@ -64,10 +64,10 @@ define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)*
   ret void
 }
 
-; FUNC-LABEL: @sextload_global_i16_to_i64
-; SI: BUFFER_LOAD_SSHORT [[LOAD:v[0-9]+]],
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 31, [[LOAD]]
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sextload_global_i16_to_i64:
+; SI: buffer_load_sshort [[LOAD:v[0-9]+]],
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
+; SI: buffer_store_dwordx2
 define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %a = load i16 addrspace(1)* %in, align 8
   %ext = sext i16 %a to i64
@@ -75,10 +75,10 @@ define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
   ret void
 }
 
-; FUNC-LABEL: @sextload_global_i32_to_i64
-; SI: BUFFER_LOAD_DWORD [[LOAD:v[0-9]+]],
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 31, [[LOAD]]
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sextload_global_i32_to_i64:
+; SI: buffer_load_dword [[LOAD:v[0-9]+]],
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
+; SI: buffer_store_dwordx2
 define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %a = load i32 addrspace(1)* %in, align 8
   %ext = sext i32 %a to i64
@@ -86,11 +86,11 @@ define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)
   ret void
 }
 
-; FUNC-LABEL: @zextload_global_i8_to_i64
-; SI: S_MOV_B32 [[ZERO:s[0-9]+]], 0
-; SI: BUFFER_LOAD_UBYTE [[LOAD:v[0-9]+]],
-; SI: V_MOV_B32_e32 {{v[0-9]+}}, [[ZERO]]
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}zextload_global_i8_to_i64:
+; SI-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
+; SI-DAG: buffer_load_ubyte [[LOAD:v[0-9]+]],
+; SI: v_mov_b32_e32 {{v[0-9]+}}, [[ZERO]]
+; SI: buffer_store_dwordx2
 define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
   %a = load i8 addrspace(1)* %in, align 8
   %ext = zext i8 %a to i64
@@ -98,11 +98,11 @@ define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)*
   ret void
 }
 
-; FUNC-LABEL: @zextload_global_i16_to_i64
-; SI: S_MOV_B32 [[ZERO:s[0-9]+]], 0
-; SI: BUFFER_LOAD_USHORT [[LOAD:v[0-9]+]],
-; SI: V_MOV_B32_e32 {{v[0-9]+}}, [[ZERO]]
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}zextload_global_i16_to_i64:
+; SI-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
+; SI-DAG: buffer_load_ushort [[LOAD:v[0-9]+]],
+; SI: v_mov_b32_e32 {{v[0-9]+}}, [[ZERO]]
+; SI: buffer_store_dwordx2
 define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %a = load i16 addrspace(1)* %in, align 8
   %ext = zext i16 %a to i64
@@ -110,11 +110,11 @@ define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
   ret void
 }
 
-; FUNC-LABEL: @zextload_global_i32_to_i64
-; SI: S_MOV_B32 [[ZERO:s[0-9]+]], 0
-; SI: BUFFER_LOAD_DWORD [[LOAD:v[0-9]+]],
-; SI: V_MOV_B32_e32 {{v[0-9]+}}, [[ZERO]]
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}zextload_global_i32_to_i64:
+; SI-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
+; SI-DAG: buffer_load_dword [[LOAD:v[0-9]+]],
+; SI: v_mov_b32_e32 {{v[0-9]+}}, [[ZERO]]
+; SI: buffer_store_dwordx2
 define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %a = load i32 addrspace(1)* %in, align 8
   %ext = zext i32 %a to i64
diff --git a/test/CodeGen/R600/extract_vector_elt_i16.ll b/test/CodeGen/R600/extract_vector_elt_i16.ll
index 5cd1b04..efdc1c8 100644
--- a/test/CodeGen/R600/extract_vector_elt_i16.ll
+++ b/test/CodeGen/R600/extract_vector_elt_i16.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @extract_vector_elt_v2i16
-; SI: BUFFER_LOAD_USHORT
-; SI: BUFFER_STORE_SHORT
-; SI: BUFFER_LOAD_USHORT
-; SI: BUFFER_STORE_SHORT
+; FUNC-LABEL: {{^}}extract_vector_elt_v2i16:
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_store_short
+; SI: buffer_store_short
 define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) nounwind {
   %p0 = extractelement <2 x i16> %foo, i32 0
   %p1 = extractelement <2 x i16> %foo, i32 1
@@ -14,11 +14,11 @@ define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) no
   ret void
 }
 
-; FUNC-LABEL: @extract_vector_elt_v4i16
-; SI: BUFFER_LOAD_USHORT
-; SI: BUFFER_STORE_SHORT
-; SI: BUFFER_LOAD_USHORT
-; SI: BUFFER_STORE_SHORT
+; FUNC-LABEL: {{^}}extract_vector_elt_v4i16:
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_store_short
+; SI: buffer_store_short
 define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) nounwind {
   %p0 = extractelement <4 x i16> %foo, i32 0
   %p1 = extractelement <4 x i16> %foo, i32 2
diff --git a/test/CodeGen/R600/fabs.f64.ll b/test/CodeGen/R600/fabs.f64.ll
new file mode 100644
index 0000000..d2ba320
--- /dev/null
+++ b/test/CodeGen/R600/fabs.f64.ll
@@ -0,0 +1,97 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+declare double @fabs(double) readnone
+declare double @llvm.fabs.f64(double) readnone
+declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone
+declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone
+
+; FUNC-LABEL: {{^}}v_fabs_f64:
+; SI: v_and_b32
+; SI: s_endpgm
+define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %tidext = sext i32 %tid to i64
+  %gep = getelementptr double addrspace(1)* %in, i64 %tidext
+  %val = load double addrspace(1)* %gep, align 8
+  %fabs = call double @llvm.fabs.f64(double %val)
+  store double %fabs, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_f64:
+; SI: v_and_b32
+; SI-NOT: v_and_b32
+; SI: s_endpgm
+define void @fabs_f64(double addrspace(1)* %out, double %in) {
+  %fabs = call double @llvm.fabs.f64(double %in)
+  store double %fabs, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_v2f64:
+; SI: v_and_b32
+; SI: v_and_b32
+; SI: s_endpgm
+define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+  %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
+  store <2 x double> %fabs, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_v4f64:
+; SI: v_and_b32
+; SI: v_and_b32
+; SI: v_and_b32
+; SI: v_and_b32
+; SI: s_endpgm
+define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+  %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
+  store <4 x double> %fabs, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}fabs_fold_f64:
+; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-NOT: and
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
+; SI: s_endpgm
+define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
+  %fabs = call double @llvm.fabs.f64(double %in0)
+  %fmul = fmul double %fabs, %in1
+  store double %fmul, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}fabs_fn_fold_f64:
+; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-NOT: and
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
+; SI: s_endpgm
+define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
+  %fabs = call double @fabs(double %in0)
+  %fmul = fmul double %fabs, %in1
+  store double %fmul, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_free_f64:
+; SI: v_and_b32
+; SI: s_endpgm
+define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) {
+  %bc= bitcast i64 %in to double
+  %fabs = call double @llvm.fabs.f64(double %bc)
+  store double %fabs, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_fn_free_f64:
+; SI: v_and_b32
+; SI: s_endpgm
+define void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
+  %bc= bitcast i64 %in to double
+  %fabs = call double @fabs(double %bc)
+  store double %fabs, double addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll
index b87ce22..06cc97f 100644
--- a/test/CodeGen/R600/fabs.ll
+++ b/test/CodeGen/R600/fabs.ll
@@ -1,65 +1,98 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
 
 ; DAGCombiner will transform:
 ; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
 ; unless isFabsFree returns true
 
-; R600-CHECK-LABEL: @fabs_free
-; R600-CHECK-NOT: AND
-; R600-CHECK: |PV.{{[XYZW]}}|
-; SI-CHECK-LABEL: @fabs_free
-; SI-CHECK: V_AND_B32
+; FUNC-LABEL: {{^}}fabs_fn_free:
+; R600-NOT: AND
+; R600: |PV.{{[XYZW]}}|
+
+; SI: v_and_b32
+
+define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
+  %bc= bitcast i32 %in to float
+  %fabs = call float @fabs(float %bc)
+  store float %fabs, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_free:
+; R600-NOT: AND
+; R600: |PV.{{[XYZW]}}|
+
+; SI: v_and_b32
 
 define void @fabs_free(float addrspace(1)* %out, i32 %in) {
-entry:
-  %0 = bitcast i32 %in to float
-  %1 = call float @fabs(float %0)
-  store float %1, float addrspace(1)* %out
+  %bc= bitcast i32 %in to float
+  %fabs = call float @llvm.fabs.f32(float %bc)
+  store float %fabs, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_f32:
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+
+; SI: v_and_b32
+define void @fabs_f32(float addrspace(1)* %out, float %in) {
+  %fabs = call float @llvm.fabs.f32(float %in)
+  store float %fabs, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_v2f32:
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+
+; SI: v_and_b32
+; SI: v_and_b32
+define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+  store <2 x float> %fabs, <2 x float> addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK-LABEL: @fabs_v2
-; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
-; SI-CHECK-LABEL: @fabs_v2
-; SI-CHECK: V_AND_B32
-; SI-CHECK: V_AND_B32
-define void @fabs_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
-entry:
-  %0 = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+; FUNC-LABEL: {{^}}fabs_v4f32:
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+
+; SI: v_and_b32
+; SI: v_and_b32
+; SI: v_and_b32
+; SI: v_and_b32
+define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+  %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
+  store <4 x float> %fabs, <4 x float> addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK-LABEL: @fabs_v4
-; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
-; SI-CHECK-LABEL: @fabs_v4
-; SI-CHECK: V_AND_B32
-; SI-CHECK: V_AND_B32
-; SI-CHECK: V_AND_B32
-; SI-CHECK: V_AND_B32
-define void @fabs_v4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
-entry:
-  %0 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
-  store <4 x float> %0, <4 x float> addrspace(1)* %out
+; SI-LABEL: {{^}}fabs_fn_fold:
+; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
+; SI-NOT: and
+; SI: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
+define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
+  %fabs = call float @fabs(float %in0)
+  %fmul = fmul float %fabs, %in1
+  store float %fmul, float addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: @fabs_fold
-; SI-CHECK-NOT: V_AND_B32_e32
-; SI-CHECK: V_MUL_F32_e64 v{{[0-9]+}}, s{{[0-9]+}}, |v{{[0-9]+}}|
+; SI-LABEL: {{^}}fabs_fold:
+; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
+; SI-NOT: and
+; SI: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
 define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
-entry:
-  %0 = call float @fabs(float %in0)
-  %1 = fmul float %0, %in1
-  store float %1, float addrspace(1)* %out
+  %fabs = call float @llvm.fabs.f32(float %in0)
+  %fmul = fmul float %fabs, %in1
+  store float %fmul, float addrspace(1)* %out
   ret void
 }
 
-declare float @fabs(float ) readnone
-declare <2 x float> @llvm.fabs.v2f32(<2 x float> ) readnone
-declare <4 x float> @llvm.fabs.v4f32(<4 x float> ) readnone
+declare float @fabs(float) readnone
+declare float @llvm.fabs.f32(float) readnone
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone
diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
index 5d2b806..774dd0b 100644
--- a/test/CodeGen/R600/fadd.ll
+++ b/test/CodeGen/R600/fadd.ll
@@ -1,66 +1,63 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
 
-; FUNC-LABEL: @fadd_f32
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
-; SI-CHECK: V_ADD_F32
+; FUNC-LABEL: {{^}}fadd_f32:
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
+; SI: v_add_f32
 define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) {
-entry:
-   %0 = fadd float %a, %b
-   store float %0, float addrspace(1)* %out
+   %add = fadd float %a, %b
+   store float %add, float addrspace(1)* %out, align 4
    ret void
 }
 
-; FUNC-LABEL: @fadd_v2f32
-; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
-; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
+; FUNC-LABEL: {{^}}fadd_v2f32:
+; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
+; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
+; SI: v_add_f32
+; SI: v_add_f32
 define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
-entry:
-  %0 = fadd <2 x float> %a, %b
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  %add = fadd <2 x float> %a, %b
+  store <2 x float> %add, <2 x float> addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @fadd_v4f32
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
+; FUNC-LABEL: {{^}}fadd_v4f32:
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
 define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
-  %a = load <4 x float> addrspace(1) * %in
-  %b = load <4 x float> addrspace(1) * %b_ptr
+  %a = load <4 x float> addrspace(1)* %in, align 16
+  %b = load <4 x float> addrspace(1)* %b_ptr, align 16
   %result = fadd <4 x float> %a, %b
-  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
-; FUNC-LABEL: @fadd_v8f32
-; R600-CHECK: ADD
-; R600-CHECK: ADD
-; R600-CHECK: ADD
-; R600-CHECK: ADD
-; R600-CHECK: ADD
-; R600-CHECK: ADD
-; R600-CHECK: ADD
-; R600-CHECK: ADD
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
+; FUNC-LABEL: {{^}}fadd_v8f32:
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
 define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) {
-entry:
-  %0 = fadd <8 x float> %a, %b
-  store <8 x float> %0, <8 x float> addrspace(1)* %out
+  %add = fadd <8 x float> %a, %b
+  store <8 x float> %add, <8 x float> addrspace(1)* %out, align 32
   ret void
 }
diff --git a/test/CodeGen/R600/fadd64.ll b/test/CodeGen/R600/fadd64.ll
index 48cd3cf..3ca8500 100644
--- a/test/CodeGen/R600/fadd64.ll
+++ b/test/CodeGen/R600/fadd64.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
-; CHECK: @fadd_f64
-; CHECK: V_ADD_F64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
+; CHECK: {{^}}fadd_f64:
+; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
 
 define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
diff --git a/test/CodeGen/R600/fceil.ll b/test/CodeGen/R600/fceil.ll
index 458363a..56dc796 100644
--- a/test/CodeGen/R600/fceil.ll
+++ b/test/CodeGen/R600/fceil.ll
@@ -8,8 +8,8 @@ declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone
 declare <8 x float> @llvm.ceil.v8f32(<8 x float>) nounwind readnone
 declare <16 x float> @llvm.ceil.v16f32(<16 x float>) nounwind readnone
 
-; FUNC-LABEL: @fceil_f32:
-; SI: V_CEIL_F32_e32
+; FUNC-LABEL: {{^}}fceil_f32:
+; SI: v_ceil_f32_e32
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
 define void @fceil_f32(float addrspace(1)* %out, float %x) {
@@ -18,9 +18,9 @@ define void @fceil_f32(float addrspace(1)* %out, float %x) {
   ret void
 }
 
-; FUNC-LABEL: @fceil_v2f32:
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
+; FUNC-LABEL: {{^}}fceil_v2f32:
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
 ; EG: CEIL {{\*? *}}[[RESULT]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
@@ -30,10 +30,10 @@ define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
   ret void
 }
 
-; FUNC-LABEL: @fceil_v3f32:
-; FIXME-SI: V_CEIL_F32_e32
-; FIXME-SI: V_CEIL_F32_e32
-; FIXME-SI: V_CEIL_F32_e32
+; FUNC-LABEL: {{^}}fceil_v3f32:
+; FIXME-SI: v_ceil_f32_e32
+; FIXME-SI: v_ceil_f32_e32
+; FIXME-SI: v_ceil_f32_e32
 ; FIXME-EG: v3 is treated as v2 and v1, hence 2 stores
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}}
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}}
@@ -46,11 +46,11 @@ define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
   ret void
 }
 
-; FUNC-LABEL: @fceil_v4f32:
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
+; FUNC-LABEL: {{^}}fceil_v4f32:
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
 ; EG: CEIL {{\*? *}}[[RESULT]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
@@ -62,15 +62,15 @@ define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
   ret void
 }
 
-; FUNC-LABEL: @fceil_v8f32:
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
+; FUNC-LABEL: {{^}}fceil_v8f32:
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}}
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}}
 ; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
@@ -87,23 +87,23 @@ define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
   ret void
 }
 
-; FUNC-LABEL: @fceil_v16f32:
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
+; FUNC-LABEL: {{^}}fceil_v16f32:
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}}
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}}
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT3:T[0-9]+]]{{\.[XYZW]}}
diff --git a/test/CodeGen/R600/fceil64.ll b/test/CodeGen/R600/fceil64.ll
index b42aefa..029f41d 100644
--- a/test/CodeGen/R600/fceil64.ll
+++ b/test/CodeGen/R600/fceil64.ll
@@ -8,94 +8,94 @@ declare <4 x double> @llvm.ceil.v4f64(<4 x double>) nounwind readnone
 declare <8 x double> @llvm.ceil.v8f64(<8 x double>) nounwind readnone
 declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
 
-; FUNC-LABEL: @fceil_f64:
-; CI: V_CEIL_F64_e32
-; SI: S_BFE_I32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
-; SI: S_ADD_I32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
-; SI: S_LSHR_B64
-; SI: S_NOT_B64
-; SI: S_AND_B64
-; SI: S_AND_B32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI: CMP_LT_I32
-; SI: CNDMASK_B32
-; SI: CNDMASK_B32
-; SI: CMP_GT_I32
-; SI: CNDMASK_B32
-; SI: CNDMASK_B32
-; SI: CMP_GT_F64
-; SI: CNDMASK_B32
-; SI: CMP_NE_I32
-; SI: CNDMASK_B32
-; SI: CNDMASK_B32
-; SI: V_ADD_F64
+; FUNC-LABEL: {{^}}fceil_f64:
+; CI: v_ceil_f64_e32
+; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
+; SI: s_lshr_b64
+; SI: s_not_b64
+; SI: s_and_b64
+; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI-DAG: cmp_lt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: cmp_gt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: cmp_gt_f64
+; SI: cndmask_b32
+; SI: cmp_ne_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: v_add_f64
 define void @fceil_f64(double addrspace(1)* %out, double %x) {
   %y = call double @llvm.ceil.f64(double %x) nounwind readnone
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @fceil_v2f64:
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
+; FUNC-LABEL: {{^}}fceil_v2f64:
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
 define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
   %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone
   store <2 x double> %y, <2 x double> addrspace(1)* %out
   ret void
 }
 
-; FIXME-FUNC-LABEL: @fceil_v3f64:
-; FIXME-CI: V_CEIL_F64_e32
-; FIXME-CI: V_CEIL_F64_e32
-; FIXME-CI: V_CEIL_F64_e32
+; FIXME-FUNC-LABEL: {{^}}fceil_v3f64:
+; FIXME-CI: v_ceil_f64_e32
+; FIXME-CI: v_ceil_f64_e32
+; FIXME-CI: v_ceil_f64_e32
 ; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
 ;   %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone
 ;   store <3 x double> %y, <3 x double> addrspace(1)* %out
 ;   ret void
 ; }
 
-; FUNC-LABEL: @fceil_v4f64:
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
+; FUNC-LABEL: {{^}}fceil_v4f64:
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
 define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
   %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone
   store <4 x double> %y, <4 x double> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @fceil_v8f64:
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
+; FUNC-LABEL: {{^}}fceil_v8f64:
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
 define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
   %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone
   store <8 x double> %y, <8 x double> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @fceil_v16f64:
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
+; FUNC-LABEL: {{^}}fceil_v16f64:
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
 define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
   %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone
   store <16 x double> %y, <16 x double> addrspace(1)* %out
diff --git a/test/CodeGen/R600/fcmp.ll b/test/CodeGen/R600/fcmp.ll
index c76a758..3399218 100644
--- a/test/CodeGen/R600/fcmp.ll
+++ b/test/CodeGen/R600/fcmp.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK: @fcmp_sext
+; CHECK: {{^}}fcmp_sext:
 ; CHECK: SETE_DX10  T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) {
@@ -18,7 +18,7 @@ entry:
 ; SET*_DX10 instruction.  Previously we were lowering this to:
 ; SET* + FP_TO_SINT
 
-; CHECK: @fcmp_br
+; CHECK: {{^}}fcmp_br:
 ; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}}
 ; CHECK-NEXT {{[0-9]+(5.0}}
 
diff --git a/test/CodeGen/R600/fcmp64.ll b/test/CodeGen/R600/fcmp64.ll
index bcc7a8c..dc24443 100644
--- a/test/CodeGen/R600/fcmp64.ll
+++ b/test/CodeGen/R600/fcmp64.ll
@@ -1,60 +1,55 @@
 ; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
-; CHECK: @flt_f64
-; CHECK: V_CMP_LT_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-
-define void @flt_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+; CHECK-LABEL: {{^}}flt_f64:
+; CHECK: v_cmp_lt_f64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
    %r1 = load double addrspace(1)* %in2
    %r2 = fcmp ult double %r0, %r1
-   %r3 = select i1 %r2, double %r0, double %r1
-   store double %r3, double addrspace(1)* %out
+   %r3 = zext i1 %r2 to i32
+   store i32 %r3, i32 addrspace(1)* %out
    ret void
 }
 
-; CHECK: @fle_f64
-; CHECK: V_CMP_LE_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-
-define void @fle_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+; CHECK-LABEL: {{^}}fle_f64:
+; CHECK: v_cmp_le_f64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
    %r1 = load double addrspace(1)* %in2
    %r2 = fcmp ule double %r0, %r1
-   %r3 = select i1 %r2, double %r0, double %r1
-   store double %r3, double addrspace(1)* %out
+   %r3 = zext i1 %r2 to i32
+   store i32 %r3, i32 addrspace(1)* %out
    ret void
 }
 
-; CHECK: @fgt_f64
-; CHECK: V_CMP_GT_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-
-define void @fgt_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+; CHECK-LABEL: {{^}}fgt_f64:
+; CHECK: v_cmp_gt_f64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
    %r1 = load double addrspace(1)* %in2
    %r2 = fcmp ugt double %r0, %r1
-   %r3 = select i1 %r2, double %r0, double %r1
-   store double %r3, double addrspace(1)* %out
+   %r3 = zext i1 %r2 to i32
+   store i32 %r3, i32 addrspace(1)* %out
    ret void
 }
 
-; CHECK: @fge_f64
-; CHECK: V_CMP_GE_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-
-define void @fge_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+; CHECK-LABEL: {{^}}fge_f64:
+; CHECK: v_cmp_ge_f64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
    %r1 = load double addrspace(1)* %in2
    %r2 = fcmp uge double %r0, %r1
-   %r3 = select i1 %r2, double %r0, double %r1
-   store double %r3, double addrspace(1)* %out
+   %r3 = zext i1 %r2 to i32
+   store i32 %r3, i32 addrspace(1)* %out
    ret void
 }
 
-; CHECK: @fne_f64
-; CHECK: V_CMP_NEQ_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-
+; CHECK-LABEL: {{^}}fne_f64:
+; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
@@ -65,9 +60,8 @@ define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
    ret void
 }
 
-; CHECK: @feq_f64
-; CHECK: V_CMP_EQ_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-
+; CHECK-LABEL: {{^}}feq_f64:
+; CHECK: v_cmp_eq_f64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
diff --git a/test/CodeGen/R600/fconst64.ll b/test/CodeGen/R600/fconst64.ll
index 9c3a7e3..097c89f 100644
--- a/test/CodeGen/R600/fconst64.ll
+++ b/test/CodeGen/R600/fconst64.ll
@@ -1,8 +1,8 @@
 ; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
-; CHECK: @fconst_f64
-; CHECK-DAG: S_MOV_B32 {{s[0-9]+}}, 0x40140000
-; CHECK-DAG: S_MOV_B32 {{s[0-9]+}}, 0
+; CHECK: {{^}}fconst_f64:
+; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000
+; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0
 
 define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
    %r1 = load double addrspace(1)* %in
diff --git a/test/CodeGen/R600/fcopysign.f32.ll b/test/CodeGen/R600/fcopysign.f32.ll
index 7b4425b..897830e 100644
--- a/test/CodeGen/R600/fcopysign.f32.ll
+++ b/test/CodeGen/R600/fcopysign.f32.ll
@@ -7,15 +7,15 @@ declare <2 x float> @llvm.copysign.v2f32(<2 x float>, <2 x float>) nounwind read
 declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind readnone
 
 ; Try to identify arg based on higher address.
-; FUNC-LABEL: @test_copysign_f32:
-; SI: S_LOAD_DWORD [[SSIGN:s[0-9]+]], {{.*}} 0xc
-; SI: V_MOV_B32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]]
-; SI-DAG: S_LOAD_DWORD [[SMAG:s[0-9]+]], {{.*}} 0xb
-; SI-DAG: V_MOV_B32_e32 [[VMAG:v[0-9]+]], [[SMAG]]
-; SI-DAG: S_MOV_B32 [[SCONST:s[0-9]+]], 0x7fffffff
-; SI: V_BFI_B32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}test_copysign_f32:
+; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb
+; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc
+; SI-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]]
+; SI-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]]
+; SI-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
+; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 
 ; EG: BFI_INT
 define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind {
@@ -24,8 +24,8 @@ define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign
   ret void
 }
 
-; FUNC-LABEL: @test_copysign_v2f32:
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}test_copysign_v2f32:
+; SI: s_endpgm
 
 ; EG: BFI_INT
 ; EG: BFI_INT
@@ -35,8 +35,8 @@ define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %ma
   ret void
 }
 
-; FUNC-LABEL: @test_copysign_v4f32:
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}test_copysign_v4f32:
+; SI: s_endpgm
 
 ; EG: BFI_INT
 ; EG: BFI_INT
diff --git a/test/CodeGen/R600/fcopysign.f64.ll b/test/CodeGen/R600/fcopysign.f64.ll
index ea7a6db..90f0ce3 100644
--- a/test/CodeGen/R600/fcopysign.f64.ll
+++ b/test/CodeGen/R600/fcopysign.f64.ll
@@ -4,32 +4,32 @@ declare double @llvm.copysign.f64(double, double) nounwind readnone
 declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind readnone
 declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone
 
-; FUNC-LABEL: @test_copysign_f64:
-; SI-DAG: S_LOAD_DWORDX2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI: V_MOV_B32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
-; SI-DAG: S_LOAD_DWORDX2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: V_MOV_B32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
-; SI-DAG: S_MOV_B32 [[SCONST:s[0-9]+]], 0x7fffffff
-; SI: V_BFI_B32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
-; SI: V_MOV_B32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
-; SI: BUFFER_STORE_DWORDX2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}test_copysign_f64:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
+; SI-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
+; SI-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
+; SI: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
+; SI: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
+; SI: s_endpgm
 define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
   %result = call double @llvm.copysign.f64(double %mag, double %sign)
   store double %result, double addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @test_copysign_v2f64:
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}test_copysign_v2f64:
+; SI: s_endpgm
 define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind {
   %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
   store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @test_copysign_v4f64:
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}test_copysign_v4f64:
+; SI: s_endpgm
 define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind {
   %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
   store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8
diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll
index 3d21524..5321fdb 100644
--- a/test/CodeGen/R600/fdiv.ll
+++ b/test/CodeGen/R600/fdiv.ll
@@ -1,20 +1,37 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; These tests check that fdiv is expanded correctly and also test that the
 ; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
 ; instruction groups.
 
-; R600-CHECK: @fdiv_v2f32
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
-; SI-CHECK: @fdiv_v2f32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
+; FUNC-LABEL: {{^}}fdiv_f32:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fdiv float %a, %b
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+
+
+; FUNC-LABEL: {{^}}fdiv_v2f32:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
 define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
 entry:
   %0 = fdiv <2 x float> %a, %b
@@ -22,24 +39,24 @@ entry:
   ret void
 }
 
-; R600-CHECK: @fdiv_v4f32
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; SI-CHECK: @fdiv_v4f32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
+; FUNC-LABEL: {{^}}fdiv_v4f32:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
 define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/fdiv64.ll b/test/CodeGen/R600/fdiv64.ll
index 79b5c8b..d424898 100644
--- a/test/CodeGen/R600/fdiv64.ll
+++ b/test/CodeGen/R600/fdiv64.ll
@@ -1,8 +1,8 @@
 ; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
-; CHECK: @fdiv_f64
-; CHECK: V_RCP_F64_e32 {{v\[[0-9]+:[0-9]+\]}}
-; CHECK: V_MUL_F64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
+; CHECK: {{^}}fdiv_f64:
+; CHECK: v_rcp_f64_e32 {{v\[[0-9]+:[0-9]+\]}}
+; CHECK: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
 
 define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
diff --git a/test/CodeGen/R600/fetch-limits.r600.ll b/test/CodeGen/R600/fetch-limits.r600.ll
index f78d1d9..d35573e 100644
--- a/test/CodeGen/R600/fetch-limits.r600.ll
+++ b/test/CodeGen/R600/fetch-limits.r600.ll
@@ -3,7 +3,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=rv670 | FileCheck %s
 
 ; R600 supports 8 fetches in a clause
-; CHECK: @fetch_limits_r600
+; CHECK: {{^}}fetch_limits_r600:
 ; CHECK: Fetch clause
 ; CHECK: Fetch clause
 
diff --git a/test/CodeGen/R600/fetch-limits.r700+.ll b/test/CodeGen/R600/fetch-limits.r700+.ll
index 1a8a43f..17760a0 100644
--- a/test/CodeGen/R600/fetch-limits.r700+.ll
+++ b/test/CodeGen/R600/fetch-limits.r700+.ll
@@ -12,7 +12,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
 
 ; r700+ supports 16 fetches in a clause
-; CHECK: @fetch_limits_r700
+; CHECK: {{^}}fetch_limits_r700:
 ; CHECK: Fetch clause
 ; CHECK: Fetch clause
 
diff --git a/test/CodeGen/R600/ffloor.ll b/test/CodeGen/R600/ffloor.ll
index 31c6116..166f705 100644
--- a/test/CodeGen/R600/ffloor.ll
+++ b/test/CodeGen/R600/ffloor.ll
@@ -8,95 +8,95 @@ declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone
 declare <8 x double> @llvm.floor.v8f64(<8 x double>) nounwind readnone
 declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
 
-; FUNC-LABEL: @ffloor_f64:
-; CI: V_FLOOR_F64_e32
+; FUNC-LABEL: {{^}}ffloor_f64:
+; CI: v_floor_f64_e32
 
-; SI: S_BFE_I32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
-; SI: S_ADD_I32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
-; SI: S_LSHR_B64
-; SI: S_NOT_B64
-; SI: S_AND_B64
-; SI: S_AND_B32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI: CMP_LT_I32
-; SI: CNDMASK_B32
-; SI: CNDMASK_B32
-; SI: CMP_GT_I32
-; SI: CNDMASK_B32
-; SI: CNDMASK_B32
-; SI: CMP_LT_F64
-; SI: CNDMASK_B32
-; SI: CMP_NE_I32
-; SI: CNDMASK_B32
-; SI: CNDMASK_B32
-; SI: V_ADD_F64
+; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
+; SI: s_lshr_b64
+; SI: s_not_b64
+; SI: s_and_b64
+; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI-DAG: cmp_lt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: cmp_gt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: cmp_lt_f64
+; SI: cndmask_b32
+; SI: cmp_ne_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: v_add_f64
 define void @ffloor_f64(double addrspace(1)* %out, double %x) {
   %y = call double @llvm.floor.f64(double %x) nounwind readnone
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @ffloor_v2f64:
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
+; FUNC-LABEL: {{^}}ffloor_v2f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
 define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
   %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone
   store <2 x double> %y, <2 x double> addrspace(1)* %out
   ret void
 }
 
-; FIXME-FUNC-LABEL: @ffloor_v3f64:
-; FIXME-CI: V_FLOOR_F64_e32
-; FIXME-CI: V_FLOOR_F64_e32
-; FIXME-CI: V_FLOOR_F64_e32
+; FIXME-FUNC-LABEL: {{^}}ffloor_v3f64:
+; FIXME-CI: v_floor_f64_e32
+; FIXME-CI: v_floor_f64_e32
+; FIXME-CI: v_floor_f64_e32
 ; define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
 ;   %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone
 ;   store <3 x double> %y, <3 x double> addrspace(1)* %out
 ;   ret void
 ; }
 
-; FUNC-LABEL: @ffloor_v4f64:
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
+; FUNC-LABEL: {{^}}ffloor_v4f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
 define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
   %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone
   store <4 x double> %y, <4 x double> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @ffloor_v8f64:
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
+; FUNC-LABEL: {{^}}ffloor_v8f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
 define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
   %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone
   store <8 x double> %y, <8 x double> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @ffloor_v16f64:
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
+; FUNC-LABEL: {{^}}ffloor_v16f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
 define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
   %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone
   store <16 x double> %y, <16 x double> addrspace(1)* %out
diff --git a/test/CodeGen/R600/flat-address-space.ll b/test/CodeGen/R600/flat-address-space.ll
new file mode 100644
index 0000000..fc5af7c
--- /dev/null
+++ b/test/CodeGen/R600/flat-address-space.ll
@@ -0,0 +1,182 @@
+; RUN: llc -O0 -march=r600 -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
+; RUN: llc -O0 -march=r600 -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+
+; Disable optimizations in case there are optimizations added that
+; specialize away generic pointer accesses.
+
+
+; CHECK-LABEL: {{^}}branch_use_flat_i32:
+; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, [M0, FLAT_SCRATCH]
+; CHECK: s_endpgm
+define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
+entry:
+  %cmp = icmp ne i32 %c, 0
+  br i1 %cmp, label %local, label %global
+
+local:
+  %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)*
+  br label %end
+
+global:
+  %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+  br label %end
+
+end:
+  %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ]
+  store i32 %x, i32 addrspace(4)* %fptr, align 4
+;  %val = load i32 addrspace(4)* %fptr, align 4
+;  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+
+
+; These testcases might become useless when there are optimizations to
+; remove generic pointers.
+
+; CHECK-LABEL: {{^}}store_flat_i32:
+; CHECK: v_mov_b32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}}
+; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK: flat_store_dword v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
+  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+  store i32 %x, i32 addrspace(4)* %fptr, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_i64:
+; CHECK: flat_store_dwordx2
+define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
+  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
+  store i64 %x, i64 addrspace(4)* %fptr, align 8
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_v4i32:
+; CHECK: flat_store_dwordx4
+define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
+  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
+  store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_trunc_i16:
+; CHECK: flat_store_short
+define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
+  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+  %y = trunc i32 %x to i16
+  store i16 %y, i16 addrspace(4)* %fptr, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_trunc_i8:
+; CHECK: flat_store_byte
+define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
+  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+  %y = trunc i32 %x to i8
+  store i8 %y, i8 addrspace(4)* %fptr, align 2
+  ret void
+}
+
+
+
+; CHECK-LABEL @load_flat_i32:
+; CHECK: flat_load_dword
+define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+  %fload = load i32 addrspace(4)* %fptr, align 4
+  store i32 %fload, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL @load_flat_i64:
+; CHECK: flat_load_dwordx2
+define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
+  %fload = load i64 addrspace(4)* %fptr, align 4
+  store i64 %fload, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; CHECK-LABEL @load_flat_v4i32:
+; CHECK: flat_load_dwordx4
+define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
+  %fload = load <4 x i32> addrspace(4)* %fptr, align 4
+  store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; CHECK-LABEL @sextload_flat_i8:
+; CHECK: flat_load_sbyte
+define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+  %fload = load i8 addrspace(4)* %fptr, align 4
+  %ext = sext i8 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL @zextload_flat_i8:
+; CHECK: flat_load_ubyte
+define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+  %fload = load i8 addrspace(4)* %fptr, align 4
+  %ext = zext i8 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL @sextload_flat_i16:
+; CHECK: flat_load_sshort
+define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+  %fload = load i16 addrspace(4)* %fptr, align 4
+  %ext = sext i16 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL @zextload_flat_i16:
+; CHECK: flat_load_ushort
+define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+  %fload = load i16 addrspace(4)* %fptr, align 4
+  %ext = zext i16 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+
+
+; TODO: This should not be zero when registers are used for small
+; scratch allocations again.
+
+; Check for prologue initializing special SGPRs pointing to scratch.
+; CHECK-LABEL: {{^}}store_flat_scratch:
+; CHECK: s_movk_i32 flat_scratch_lo, 0
+; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}}
+; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}}
+; CHECK: flat_store_dword
+; CHECK: s_barrier
+; CHECK: flat_load_dword
+define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
+  %alloca = alloca i32, i32 9, align 4
+  %x = call i32 @llvm.r600.read.tidig.x() #3
+  %pptr = getelementptr i32* %alloca, i32 %x
+  %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
+  store i32 %x, i32 addrspace(4)* %fptr
+  ; Dummy call
+  call void @llvm.AMDGPU.barrier.local() #1
+  %reload = load i32 addrspace(4)* %fptr, align 4
+  store i32 %reload, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare void @llvm.AMDGPU.barrier.local() #1
+declare i32 @llvm.r600.read.tidig.x() #3
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind noduplicate }
+attributes #3 = { nounwind readnone }
diff --git a/test/CodeGen/R600/fma.f64.ll b/test/CodeGen/R600/fma.f64.ll
new file mode 100644
index 0000000..4b0ab76
--- /dev/null
+++ b/test/CodeGen/R600/fma.f64.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.fma.f64(double, double, double) nounwind readnone
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+
+
+; FUNC-LABEL: {{^}}fma_f64:
+; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                     double addrspace(1)* %in2, double addrspace(1)* %in3) {
+   %r0 = load double addrspace(1)* %in1
+   %r1 = load double addrspace(1)* %in2
+   %r2 = load double addrspace(1)* %in3
+   %r3 = tail call double @llvm.fma.f64(double %r0, double %r1, double %r2)
+   store double %r3, double addrspace(1)* %out
+   ret void
+}
+
+; FUNC-LABEL: {{^}}fma_v2f64:
+; SI: v_fma_f64
+; SI: v_fma_f64
+define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+                       <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) {
+   %r0 = load <2 x double> addrspace(1)* %in1
+   %r1 = load <2 x double> addrspace(1)* %in2
+   %r2 = load <2 x double> addrspace(1)* %in3
+   %r3 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2)
+   store <2 x double> %r3, <2 x double> addrspace(1)* %out
+   ret void
+}
+
+; FUNC-LABEL: {{^}}fma_v4f64:
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
+                       <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) {
+   %r0 = load <4 x double> addrspace(1)* %in1
+   %r1 = load <4 x double> addrspace(1)* %in2
+   %r2 = load <4 x double> addrspace(1)* %in3
+   %r3 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %r0, <4 x double> %r1, <4 x double> %r2)
+   store <4 x double> %r3, <4 x double> addrspace(1)* %out
+   ret void
+}
diff --git a/test/CodeGen/R600/fma.ll b/test/CodeGen/R600/fma.ll
index d72ffec..637e799 100644
--- a/test/CodeGen/R600/fma.ll
+++ b/test/CodeGen/R600/fma.ll
@@ -1,89 +1,92 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare float @llvm.fma.f32(float, float, float) nounwind readnone
 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
 
-declare double @llvm.fma.f64(double, double, double) nounwind readnone
-declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
-declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
-; FUNC-LABEL: @fma_f32
-; SI: V_FMA_F32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+; FUNC-LABEL: {{^}}fma_f32:
+; SI: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}},
+; EG: FMA {{\*? *}}[[RES]]
 define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                      float addrspace(1)* %in2, float addrspace(1)* %in3) {
-   %r0 = load float addrspace(1)* %in1
-   %r1 = load float addrspace(1)* %in2
-   %r2 = load float addrspace(1)* %in3
-   %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
-   store float %r3, float addrspace(1)* %out
-   ret void
+  %r0 = load float addrspace(1)* %in1
+  %r1 = load float addrspace(1)* %in2
+  %r2 = load float addrspace(1)* %in3
+  %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
+  store float %r3, float addrspace(1)* %out
+  ret void
 }
 
-; FUNC-LABEL: @fma_v2f32
-; SI: V_FMA_F32
-; SI: V_FMA_F32
+; FUNC-LABEL: {{^}}fma_v2f32:
+; SI: v_fma_f32
+; SI: v_fma_f32
+
+; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}},
+; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]]
+; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]]
 define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
                        <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) {
-   %r0 = load <2 x float> addrspace(1)* %in1
-   %r1 = load <2 x float> addrspace(1)* %in2
-   %r2 = load <2 x float> addrspace(1)* %in3
-   %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
-   store <2 x float> %r3, <2 x float> addrspace(1)* %out
-   ret void
+  %r0 = load <2 x float> addrspace(1)* %in1
+  %r1 = load <2 x float> addrspace(1)* %in2
+  %r2 = load <2 x float> addrspace(1)* %in3
+  %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
+  store <2 x float> %r3, <2 x float> addrspace(1)* %out
+  ret void
 }
 
-; FUNC-LABEL: @fma_v4f32
-; SI: V_FMA_F32
-; SI: V_FMA_F32
-; SI: V_FMA_F32
-; SI: V_FMA_F32
+; FUNC-LABEL: {{^}}fma_v4f32:
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+
+; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}},
+; EG-DAG: FMA {{\*? *}}[[RES]].X
+; EG-DAG: FMA {{\*? *}}[[RES]].Y
+; EG-DAG: FMA {{\*? *}}[[RES]].Z
+; EG-DAG: FMA {{\*? *}}[[RES]].W
 define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
                        <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) {
-   %r0 = load <4 x float> addrspace(1)* %in1
-   %r1 = load <4 x float> addrspace(1)* %in2
-   %r2 = load <4 x float> addrspace(1)* %in3
-   %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
-   store <4 x float> %r3, <4 x float> addrspace(1)* %out
-   ret void
+  %r0 = load <4 x float> addrspace(1)* %in1
+  %r1 = load <4 x float> addrspace(1)* %in2
+  %r2 = load <4 x float> addrspace(1)* %in3
+  %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
+  store <4 x float> %r3, <4 x float> addrspace(1)* %out
+  ret void
 }
 
-; FUNC-LABEL: @fma_f64
-; SI: V_FMA_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
-                     double addrspace(1)* %in2, double addrspace(1)* %in3) {
-   %r0 = load double addrspace(1)* %in1
-   %r1 = load double addrspace(1)* %in2
-   %r2 = load double addrspace(1)* %in3
-   %r3 = tail call double @llvm.fma.f64(double %r0, double %r1, double %r2)
-   store double %r3, double addrspace(1)* %out
-   ret void
-}
+; FUNC-LABEL: @fma_commute_mul_inline_imm_f32
+; SI: v_fma_f32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}}
+define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
 
-; FUNC-LABEL: @fma_v2f64
-; SI: V_FMA_F64
-; SI: V_FMA_F64
-define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
-                       <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) {
-   %r0 = load <2 x double> addrspace(1)* %in1
-   %r1 = load <2 x double> addrspace(1)* %in2
-   %r2 = load <2 x double> addrspace(1)* %in3
-   %r3 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2)
-   store <2 x double> %r3, <2 x double> addrspace(1)* %out
-   ret void
+  %a = load float addrspace(1)* %in.a.gep, align 4
+  %b = load float addrspace(1)* %in.b.gep, align 4
+
+  %fma = call float @llvm.fma.f32(float %a, float 2.0, float %b)
+  store float %fma, float addrspace(1)* %out.gep, align 4
+  ret void
 }
 
-; FUNC-LABEL: @fma_v4f64
-; SI: V_FMA_F64
-; SI: V_FMA_F64
-; SI: V_FMA_F64
-; SI: V_FMA_F64
-define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
-                       <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) {
-   %r0 = load <4 x double> addrspace(1)* %in1
-   %r1 = load <4 x double> addrspace(1)* %in2
-   %r2 = load <4 x double> addrspace(1)* %in3
-   %r3 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %r0, <4 x double> %r1, <4 x double> %r2)
-   store <4 x double> %r3, <4 x double> addrspace(1)* %out
-   ret void
+; FUNC-LABEL: @fma_commute_mul_s_f32
+define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %in.a.gep, align 4
+  %c = load float addrspace(1)* %in.b.gep, align 4
+
+  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
+  store float %fma, float addrspace(1)* %out.gep, align 4
+  ret void
 }
diff --git a/test/CodeGen/R600/fmax3.ll b/test/CodeGen/R600/fmax3.ll
new file mode 100644
index 0000000..cf371b3
--- /dev/null
+++ b/test/CodeGen/R600/fmax3.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.maxnum.f32(float, float) nounwind readnone
+
+; SI-LABEL: {{^}}test_fmax3_olt_0:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %a = load float addrspace(1)* %aptr, align 4
+  %b = load float addrspace(1)* %bptr, align 4
+  %c = load float addrspace(1)* %cptr, align 4
+  %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
+  %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone
+  store float %f1, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; Commute operand of second fmax
+; SI-LABEL: {{^}}test_fmax3_olt_1:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %a = load float addrspace(1)* %aptr, align 4
+  %b = load float addrspace(1)* %bptr, align 4
+  %c = load float addrspace(1)* %cptr, align 4
+  %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
+  %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone
+  store float %f1, float addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/fmax_legacy.ll b/test/CodeGen/R600/fmax_legacy.ll
new file mode 100644
index 0000000..e9d837b
--- /dev/null
+++ b/test/CodeGen/R600/fmax_legacy.ll
@@ -0,0 +1,83 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FUNC-LABEL: @test_fmax_legacy_uge_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; EG: MAX
+define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp uge float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_legacy_oge_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; EG: MAX
+define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp oge float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_legacy_ugt_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; EG: MAX
+define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp ugt float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_legacy_ogt_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; EG: MAX
+define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp ogt float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/fmaxnum.f64.ll b/test/CodeGen/R600/fmaxnum.f64.ll
new file mode 100644
index 0000000..51cbf4d
--- /dev/null
+++ b/test/CodeGen/R600/fmaxnum.f64.ll
@@ -0,0 +1,75 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.maxnum.f64(double, double) #0
+declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) #0
+declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>) #0
+declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>) #0
+declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>) #0
+
+; FUNC-LABEL: @test_fmax_f64
+; SI: v_max_f64
+define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+  %val = call double @llvm.maxnum.f64(double %a, double %b) #0
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_v2f64
+; SI: v_max_f64
+; SI: v_max_f64
+define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+  %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) #0
+  store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_v4f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+  %val = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b) #0
+  store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_v8f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+  %val = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %a, <8 x double> %b) #0
+  store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_v16f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+define void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+  %val = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %a, <16 x double> %b) #0
+  store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/R600/fmaxnum.ll b/test/CodeGen/R600/fmaxnum.ll
new file mode 100644
index 0000000..01d30b0
--- /dev/null
+++ b/test/CodeGen/R600/fmaxnum.ll
@@ -0,0 +1,191 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.maxnum.f32(float, float) #0
+declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0
+declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #0
+declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #0
+
+declare double @llvm.maxnum.f64(double, double)
+
+; FUNC-LABEL: @test_fmax_f32
+; SI: v_max_f32_e32
+define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+  %val = call float @llvm.maxnum.f32(float %a, float %b) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_v2f32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+  %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0
+  store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_v4f32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+  %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0
+  store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_v8f32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+  %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0
+  store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_v16f32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+  %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0
+  store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_nan_nan
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_val_nan
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_nan_val
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_p0_p0
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_p0_n0
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_n0_p0
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_n0_n0
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @fmax_var_immediate_f32
+; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @fmax_immediate_var_f32
+; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @fmax_var_literal_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
+; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @fmax_literal_var_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
+; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/R600/fmin3.ll b/test/CodeGen/R600/fmin3.ll
new file mode 100644
index 0000000..7420368
--- /dev/null
+++ b/test/CodeGen/R600/fmin3.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.minnum.f32(float, float) nounwind readnone
+
+; SI-LABEL: {{^}}test_fmin3_olt_0:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %a = load float addrspace(1)* %aptr, align 4
+  %b = load float addrspace(1)* %bptr, align 4
+  %c = load float addrspace(1)* %cptr, align 4
+  %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
+  %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone
+  store float %f1, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; Commute operand of second fmin
+; SI-LABEL: {{^}}test_fmin3_olt_1:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %a = load float addrspace(1)* %aptr, align 4
+  %b = load float addrspace(1)* %bptr, align 4
+  %c = load float addrspace(1)* %cptr, align 4
+  %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
+  %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone
+  store float %f1, float addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/fmin_legacy.ll b/test/CodeGen/R600/fmin_legacy.ll
new file mode 100644
index 0000000..2fbdb6b
--- /dev/null
+++ b/test/CodeGen/R600/fmin_legacy.ll
@@ -0,0 +1,92 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FUNC-LABEL: @test_fmin_legacy_f32
+; EG: MIN *
+; SI: v_min_legacy_f32_e32
+define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
+   %r0 = extractelement <4 x float> %reg0, i32 0
+   %r1 = extractelement <4 x float> %reg0, i32 1
+   %r2 = fcmp uge float %r0, %r1
+   %r3 = select i1 %r2, float %r1, float %r0
+   %vec = insertelement <4 x float> undef, float %r3, i32 0
+   store <4 x float> %vec, <4 x float> addrspace(1)* %out, align 16
+   ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_ule_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp ule float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_ole_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp ole float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_olt_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp olt float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_ult_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp ult float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/fminnum.f64.ll b/test/CodeGen/R600/fminnum.f64.ll
new file mode 100644
index 0000000..11b0c20
--- /dev/null
+++ b/test/CodeGen/R600/fminnum.f64.ll
@@ -0,0 +1,75 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.minnum.f64(double, double) #0
+declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) #0
+declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) #0
+declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0
+declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0
+
+; FUNC-LABEL: @test_fmin_f64
+; SI: v_min_f64
+define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+  %val = call double @llvm.minnum.f64(double %a, double %b) #0
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_v2f64
+; SI: v_min_f64
+; SI: v_min_f64
+define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+  %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0
+  store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_v4f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+  %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0
+  store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_v8f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+  %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0
+  store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_v16f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+define void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+  %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0
+  store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/R600/fminnum.ll b/test/CodeGen/R600/fminnum.ll
new file mode 100644
index 0000000..65adab6
--- /dev/null
+++ b/test/CodeGen/R600/fminnum.ll
@@ -0,0 +1,189 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.minnum.f32(float, float) #0
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0
+declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #0
+declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0
+
+; FUNC-LABEL: @test_fmin_f32
+; SI: v_min_f32_e32
+define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+  %val = call float @llvm.minnum.f32(float %a, float %b) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_v2f32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+  %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0
+  store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_v4f32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+  %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0
+  store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_v8f32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+  %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0
+  store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_v16f32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+  %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0
+  store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmin_f32
+; SI-NOT: v_min_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmin_f32_nan_nan
+; SI-NOT: v_min_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmin_f32_val_nan
+; SI-NOT: v_min_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmin_f32_nan_val
+; SI-NOT: v_min_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmin_f32_p0_p0
+; SI-NOT: v_min_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmin_f32_p0_n0
+; SI-NOT: v_min_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmin_f32_n0_p0
+; SI-NOT: v_min_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmin_f32_n0_n0
+; SI-NOT: v_min_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @fmin_var_immediate_f32
+; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.minnum.f32(float %a, float 2.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @fmin_immediate_var_f32
+; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.minnum.f32(float 2.0, float %a) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @fmin_var_literal_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
+; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.minnum.f32(float %a, float 99.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @fmin_literal_var_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
+; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.minnum.f32(float 99.0, float %a) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll
index 2a7825f..eabb271 100644
--- a/test/CodeGen/R600/fmul.ll
+++ b/test/CodeGen/R600/fmul.ll
@@ -1,10 +1,11 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
-; R600-CHECK: @fmul_f32
-; R600-CHECK: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
-; SI-CHECK: @fmul_f32
-; SI-CHECK: V_MUL_F32
+
+; FUNC-LABEL: {{^}}fmul_f32:
+; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
+
+; SI: v_mul_f32
 define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fmul float %a, %b
@@ -16,12 +17,12 @@ declare float @llvm.R600.load.input(i32) readnone
 
 declare void @llvm.AMDGPU.store.output(float, i32)
 
-; R600-CHECK: @fmul_v2f32
-; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
-; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
-; SI-CHECK: @fmul_v2f32
-; SI-CHECK: V_MUL_F32
-; SI-CHECK: V_MUL_F32
+; FUNC-LABEL: {{^}}fmul_v2f32:
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
+
+; SI: v_mul_f32
+; SI: v_mul_f32
 define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
 entry:
   %0 = fmul <2 x float> %a, %b
@@ -29,16 +30,16 @@ entry:
   ret void
 }
 
-; R600-CHECK: @fmul_v4f32
-; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI-CHECK: @fmul_v4f32
-; SI-CHECK: V_MUL_F32
-; SI-CHECK: V_MUL_F32
-; SI-CHECK: V_MUL_F32
-; SI-CHECK: V_MUL_F32
+; FUNC-LABEL: {{^}}fmul_v4f32:
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_mul_f32
+; SI: v_mul_f32
+; SI: v_mul_f32
+; SI: v_mul_f32
 define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float> addrspace(1) * %in
@@ -47,3 +48,28 @@ define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
   store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}test_mul_2_k:
+; SI: v_mul_f32
+; SI-NOT: v_mul_f32
+; SI: s_endpgm
+define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 {
+  %y = fmul float %x, 2.0
+  %z = fmul float %y, 3.0
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_mul_2_k_inv:
+; SI: v_mul_f32
+; SI-NOT: v_mul_f32
+; SI-NOT: v_mad_f32
+; SI: s_endpgm
+define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 {
+  %y = fmul float %x, 3.0
+  %z = fmul float %y, 2.0
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/R600/fmul64.ll b/test/CodeGen/R600/fmul64.ll
index 7c7bf04..0a5f707 100644
--- a/test/CodeGen/R600/fmul64.ll
+++ b/test/CodeGen/R600/fmul64.ll
@@ -1,8 +1,7 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
-
-; CHECK: @fmul_f64
-; CHECK: V_MUL_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; RUN: llc -march=r600 -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
 
+; FUNC-LABEL: {{^}}fmul_f64:
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
@@ -11,3 +10,29 @@ define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
    store double %r2, double addrspace(1)* %out
    ret void
 }
+
+; FUNC-LABEL: {{^}}fmul_v2f64:
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+                        <2 x double> addrspace(1)* %in2) {
+   %r0 = load <2 x double> addrspace(1)* %in1
+   %r1 = load <2 x double> addrspace(1)* %in2
+   %r2 = fmul <2 x double> %r0, %r1
+   store <2 x double> %r2, <2 x double> addrspace(1)* %out
+   ret void
+}
+
+; FUNC-LABEL: {{^}}fmul_v4f64:
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
+                        <4 x double> addrspace(1)* %in2) {
+   %r0 = load <4 x double> addrspace(1)* %in1
+   %r1 = load <4 x double> addrspace(1)* %in2
+   %r2 = fmul <4 x double> %r0, %r1
+   store <4 x double> %r2, <4 x double> addrspace(1)* %out
+   ret void
+}
diff --git a/test/CodeGen/R600/fmuladd.ll b/test/CodeGen/R600/fmuladd.ll
index 48944f6..16003a5 100644
--- a/test/CodeGen/R600/fmuladd.ll
+++ b/test/CodeGen/R600/fmuladd.ll
@@ -1,7 +1,12 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
 
-; CHECK: @fmuladd_f32
-; CHECK: V_MAD_F32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+declare float @llvm.fmuladd.f32(float, float, float)
+declare double @llvm.fmuladd.f64(double, double, double)
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+; CHECK-LABEL: {{^}}fmuladd_f32:
+; CHECK: v_mad_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                          float addrspace(1)* %in2, float addrspace(1)* %in3) {
@@ -13,10 +18,8 @@ define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
    ret void
 }
 
-declare float @llvm.fmuladd.f32(float, float, float)
-
-; CHECK: @fmuladd_f64
-; CHECK: V_FMA_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; CHECK-LABEL: {{^}}fmuladd_f64:
+; CHECK: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 
 define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                          double addrspace(1)* %in2, double addrspace(1)* %in3) {
@@ -28,4 +31,169 @@ define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
    ret void
 }
 
-declare double @llvm.fmuladd.f64(double, double, double)
+; CHECK-LABEL: {{^}}fmuladd_2.0_a_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}fmuladd_a_2.0_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}fadd_a_a_b_f32:
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fadd_a_a_b_f32(float addrspace(1)* %out,
+                            float addrspace(1)* %in1,
+                            float addrspace(1)* %in2) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r0 = load float addrspace(1)* %gep.0
+  %r1 = load float addrspace(1)* %gep.1
+
+  %add.0 = fadd float %r0, %r0
+  %add.1 = fadd float %add.0, %r1
+  store float %add.1, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}fadd_b_a_a_f32:
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fadd_b_a_a_f32(float addrspace(1)* %out,
+                            float addrspace(1)* %in1,
+                            float addrspace(1)* %in2) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r0 = load float addrspace(1)* %gep.0
+  %r1 = load float addrspace(1)* %gep.1
+
+  %add.0 = fadd float %r0, %r0
+  %add.1 = fadd float %r1, %add.0
+  store float %add.1, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+
+; CHECK-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %r1.fneg = fsub float -0.000000e+00, %r1
+
+  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+
+; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %r1.fneg = fsub float -0.000000e+00, %r1
+
+  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+
+; CHECK-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %r2.fneg = fsub float -0.000000e+00, %r2
+
+  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
diff --git a/test/CodeGen/R600/fneg-fabs.f64.ll b/test/CodeGen/R600/fneg-fabs.f64.ll
new file mode 100644
index 0000000..555f4cc
--- /dev/null
+++ b/test/CodeGen/R600/fneg-fabs.f64.ll
@@ -0,0 +1,101 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FIXME: Check something here. Currently it seems fabs + fneg aren't
+; into 2 modifiers, although theoretically that should work.
+
+; FUNC-LABEL: {{^}}fneg_fabs_fadd_f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x7fffffff
+; SI: v_and_b32_e32 v[[FABS:[0-9]+]], {{s[0-9]+}}, [[IMMREG]]
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+}}:[[FABS]]{{\]}}
+define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
+  %fabs = call double @llvm.fabs.f64(double %x)
+  %fsub = fsub double -0.000000e+00, %fabs
+  %fadd = fadd double %y, %fsub
+  store double %fadd, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) {
+  %x = load double addrspace(1)* %xptr, align 8
+  %y = load double addrspace(1)* %xptr, align 8
+  %fabs = call double @llvm.fabs.f64(double %x)
+  %fsub = fsub double -0.000000e+00, %fabs
+  %fadd = fadd double %y, %fsub
+  store double %fadd, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fmul_f64:
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|
+define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
+  %fabs = call double @llvm.fabs.f64(double %x)
+  %fsub = fsub double -0.000000e+00, %fabs
+  %fmul = fmul double %y, %fsub
+  store double %fmul, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_free_f64:
+define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
+  %bc = bitcast i64 %in to double
+  %fabs = call double @llvm.fabs.f64(double %bc)
+  %fsub = fsub double -0.000000e+00, %fabs
+  store double %fsub, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
+  %bc = bitcast i64 %in to double
+  %fabs = call double @fabs(double %bc)
+  %fsub = fsub double -0.000000e+00, %fabs
+  store double %fsub, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_f64:
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
+; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
+define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
+  %fabs = call double @llvm.fabs.f64(double %in)
+  %fsub = fsub double -0.000000e+00, %fabs
+  store double %fsub, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_v2f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+  %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
+  %fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
+  store <2 x double> %fsub, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_v4f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+  %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
+  %fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs
+  store <4 x double> %fsub, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+declare double @fabs(double) readnone
+declare double @llvm.fabs.f64(double) readnone
+declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone
+declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone
diff --git a/test/CodeGen/R600/fneg-fabs.ll b/test/CodeGen/R600/fneg-fabs.ll
index d95e131..3cc832f 100644
--- a/test/CodeGen/R600/fneg-fabs.ll
+++ b/test/CodeGen/R600/fneg-fabs.ll
@@ -1,55 +1,117 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
+; SI-NOT: and
+; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}|
+define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
+  %fabs = call float @llvm.fabs.f32(float %x)
+  %fsub = fsub float -0.000000e+00, %fabs
+  %fadd = fadd float %y, %fsub
+  store float %fadd, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32:
+; SI-NOT: and
+; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}|
+; SI-NOT: and
+define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
+  %fabs = call float @llvm.fabs.f32(float %x)
+  %fsub = fsub float -0.000000e+00, %fabs
+  %fmul = fmul float %y, %fsub
+  store float %fmul, float addrspace(1)* %out, align 4
+  ret void
+}
 
 ; DAGCombiner will transform:
 ; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
 ; unless isFabsFree returns true
 
-; R600-CHECK-LABEL: @fneg_fabs_free
-; R600-CHECK-NOT: AND
-; R600-CHECK: |PV.{{[XYZW]}}|
-; R600-CHECK: -PV
-; SI-CHECK-LABEL: @fneg_fabs_free
-; SI-CHECK: V_OR_B32
-
-define void @fneg_fabs_free(float addrspace(1)* %out, i32 %in) {
-entry:
-  %0 = bitcast i32 %in to float
-  %1 = call float @fabs(float %0)
-  %2 = fsub float -0.000000e+00, %1
-  store float %2, float addrspace(1)* %out
-  ret void
-}
-
-; R600-CHECK-LABEL: @fneg_fabs_v2
-; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600-CHECK: -PV
-; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600-CHECK: -PV
-; SI-CHECK-LABEL: @fneg_fabs_v2
-; SI-CHECK: V_OR_B32
-; SI-CHECK: V_OR_B32
-define void @fneg_fabs_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
-entry:
-  %0 = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
-  %1 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %0
-  store <2 x float> %1, <2 x float> addrspace(1)* %out
-  ret void
-}
-
-; SI-CHECK-LABEL: @fneg_fabs_v4
-; SI-CHECK: V_OR_B32
-; SI-CHECK: V_OR_B32
-; SI-CHECK: V_OR_B32
-; SI-CHECK: V_OR_B32
-define void @fneg_fabs_v4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
-entry:
-  %0 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
-  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0
-  store <4 x float> %1, <4 x float> addrspace(1)* %out
-  ret void
-}
-
-declare float @fabs(float ) readnone
-declare <2 x float> @llvm.fabs.v2f32(<2 x float> ) readnone
-declare <4 x float> @llvm.fabs.v4f32(<4 x float> ) readnone
+; FUNC-LABEL: {{^}}fneg_fabs_free_f32:
+; R600-NOT: AND
+; R600: |PV.{{[XYZW]}}|
+; R600: -PV
+
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
+  %bc = bitcast i32 %in to float
+  %fabs = call float @llvm.fabs.f32(float %bc)
+  %fsub = fsub float -0.000000e+00, %fabs
+  store float %fsub, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f32:
+; R600-NOT: AND
+; R600: |PV.{{[XYZW]}}|
+; R600: -PV
+
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
+  %bc = bitcast i32 %in to float
+  %fabs = call float @fabs(float %bc)
+  %fsub = fsub float -0.000000e+00, %fabs
+  store float %fsub, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_f32:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
+  %fabs = call float @llvm.fabs.f32(float %in)
+  %fsub = fsub float -0.000000e+00, %fabs
+  store float %fsub, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_fneg_fabs_f32:
+; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
+define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %val = load float addrspace(1)* %in, align 4
+  %fabs = call float @llvm.fabs.f32(float %val)
+  %fsub = fsub float -0.000000e+00, %fabs
+  store float %fsub, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_v2f32:
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: -PV
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: -PV
+
+; FIXME: SGPR should be used directly for first src operand.
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+  %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
+  store <2 x float> %fsub, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: SGPR should be used directly for first src operand.
+; FUNC-LABEL: {{^}}fneg_fabs_v4f32:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+  %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
+  %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs
+  store <4 x float> %fsub, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+declare float @fabs(float) readnone
+declare float @llvm.fabs.f32(float) readnone
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone
diff --git a/test/CodeGen/R600/fneg.f64.ll b/test/CodeGen/R600/fneg.f64.ll
new file mode 100644
index 0000000..7aa08a9
--- /dev/null
+++ b/test/CodeGen/R600/fneg.f64.ll
@@ -0,0 +1,59 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}fneg_f64:
+; SI: v_xor_b32
+define void @fneg_f64(double addrspace(1)* %out, double %in) {
+  %fneg = fsub double -0.000000e+00, %in
+  store double %fneg, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_v2f64:
+; SI: v_xor_b32
+; SI: v_xor_b32
+define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) {
+  %fneg = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %in
+  store <2 x double> %fneg, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_v4f64:
+; R600: -PV
+; R600: -T
+; R600: -PV
+; R600: -PV
+
+; SI: v_xor_b32
+; SI: v_xor_b32
+; SI: v_xor_b32
+; SI: v_xor_b32
+define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) {
+  %fneg = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %in
+  store <4 x double> %fneg, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; DAGCombiner will transform:
+; (fneg (f64 bitcast (i64 a))) => (f64 bitcast (xor (i64 a), 0x80000000))
+; unless the target returns true for isNegFree()
+
+; FUNC-LABEL: {{^}}fneg_free_f64:
+; FIXME: Unnecessary copy to VGPRs
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]$}}
+define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
+  %bc = bitcast i64 %in to double
+  %fsub = fsub double 0.0, %bc
+  store double %fsub, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}fneg_fold_f64:
+; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-NOT: xor
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]]
+define void @fneg_fold_f64(double addrspace(1)* %out, double %in) {
+  %fsub = fsub double -0.0, %in
+  %fmul = fmul double %fsub, %in
+  store double %fmul, double addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fneg.ll b/test/CodeGen/R600/fneg.ll
index 4cddc73..c20cf24 100644
--- a/test/CodeGen/R600/fneg.ll
+++ b/test/CodeGen/R600/fneg.ll
@@ -1,44 +1,41 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
-; R600-CHECK-LABEL: @fneg
-; R600-CHECK: -PV
-; SI-CHECK-LABEL: @fneg
-; SI-CHECK: V_XOR_B32
-define void @fneg(float addrspace(1)* %out, float %in) {
-entry:
-  %0 = fsub float -0.000000e+00, %in
-  store float %0, float addrspace(1)* %out
+; FUNC-LABEL: {{^}}fneg_f32:
+; R600: -PV
+
+; SI: v_xor_b32
+define void @fneg_f32(float addrspace(1)* %out, float %in) {
+  %fneg = fsub float -0.000000e+00, %in
+  store float %fneg, float addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK-LABEL: @fneg_v2
-; R600-CHECK: -PV
-; R600-CHECK: -PV
-; SI-CHECK-LABEL: @fneg_v2
-; SI-CHECK: V_XOR_B32
-; SI-CHECK: V_XOR_B32
-define void @fneg_v2(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
-entry:
-  %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+; FUNC-LABEL: {{^}}fneg_v2f32:
+; R600: -PV
+; R600: -PV
+
+; SI: v_xor_b32
+; SI: v_xor_b32
+define void @fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
+  %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
+  store <2 x float> %fneg, <2 x float> addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK-LABEL: @fneg_v4
-; R600-CHECK: -PV
-; R600-CHECK: -T
-; R600-CHECK: -PV
-; R600-CHECK: -PV
-; SI-CHECK-LABEL: @fneg_v4
-; SI-CHECK: V_XOR_B32
-; SI-CHECK: V_XOR_B32
-; SI-CHECK: V_XOR_B32
-; SI-CHECK: V_XOR_B32
-define void @fneg_v4(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
-entry:
-  %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
-  store <4 x float> %0, <4 x float> addrspace(1)* %out
+; FUNC-LABEL: {{^}}fneg_v4f32:
+; R600: -PV
+; R600: -T
+; R600: -PV
+; R600: -PV
+
+; SI: v_xor_b32
+; SI: v_xor_b32
+; SI: v_xor_b32
+; SI: v_xor_b32
+define void @fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
+  %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
+  store <4 x float> %fneg, <4 x float> addrspace(1)* %out
   ret void
 }
 
@@ -46,27 +43,26 @@ entry:
 ; (fneg (f32 bitcast (i32 a))) => (f32 bitcast (xor (i32 a), 0x80000000))
 ; unless the target returns true for isNegFree()
 
-; R600-CHECK-LABEL: @fneg_free
-; R600-CHECK-NOT: XOR
-; R600-CHECK: -KC0[2].Z
-; SI-CHECK-LABEL: @fneg_free
-; XXX: We could use V_ADD_F32_e64 with the negate bit here instead.
-; SI-CHECK: V_SUB_F32_e64 v{{[0-9]}}, 0.000000e+00, s{{[0-9]}}, 0, 0
-define void @fneg_free(float addrspace(1)* %out, i32 %in) {
-entry:
-  %0 = bitcast i32 %in to float
-  %1 = fsub float 0.0, %0
-  store float %1, float addrspace(1)* %out
+; FUNC-LABEL: {{^}}fneg_free_f32:
+; R600-NOT: XOR
+; R600: -KC0[2].Z
+
+; XXX: We could use v_add_f32_e64 with the negate bit here instead.
+; SI: v_sub_f32_e64 v{{[0-9]}}, 0.0, s{{[0-9]+$}}
+define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) {
+  %bc = bitcast i32 %in to float
+  %fsub = fsub float 0.0, %bc
+  store float %fsub, float addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: @fneg_fold
-; SI-CHECK-NOT: V_XOR_B32
-; SI-CHECK: V_MUL_F32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
-define void @fneg_fold(float addrspace(1)* %out, float %in) {
-entry:
-  %0 = fsub float -0.0, %in
-  %1 = fmul float %0, %in
-  store float %1, float addrspace(1)* %out
+; FUNC-LABEL: {{^}}fneg_fold_f32:
+; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
+; SI-NOT: xor
+; SI: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
+define void @fneg_fold_f32(float addrspace(1)* %out, float %in) {
+  %fsub = fsub float -0.0, %in
+  %fmul = fmul float %fsub, %in
+  store float %fmul, float addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/fp16_to_fp.ll b/test/CodeGen/R600/fp16_to_fp.ll
new file mode 100644
index 0000000..ec3e051
--- /dev/null
+++ b/test/CodeGen/R600/fp16_to_fp.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
+
+; SI-LABEL: {{^}}test_convert_fp16_to_fp32:
+; SI: buffer_load_ushort [[VAL:v[0-9]+]]
+; SI: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+  %val = load i16 addrspace(1)* %in, align 2
+  %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
+  store float %cvt, float addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; SI-LABEL: {{^}}test_convert_fp16_to_fp64:
+; SI: buffer_load_ushort [[VAL:v[0-9]+]]
+; SI: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]]
+; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+  %val = load i16 addrspace(1)* %in, align 2
+  %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
+  store double %cvt, double addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/fp16_to_fp32.ll b/test/CodeGen/R600/fp16_to_fp32.ll
deleted file mode 100644
index fa2e379..0000000
--- a/test/CodeGen/R600/fp16_to_fp32.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-declare i16 @llvm.convert.to.fp16(float) nounwind readnone
-
-; SI-LABEL: @test_convert_fp16_to_fp32:
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]]
-; SI: V_CVT_F16_F32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-define void @test_convert_fp16_to_fp32(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
-  %val = load float addrspace(1)* %in, align 4
-  %cvt = call i16 @llvm.convert.to.fp16(float %val) nounwind readnone
-  store i16 %cvt, i16 addrspace(1)* %out, align 2
-  ret void
-}
diff --git a/test/CodeGen/R600/fp32_to_fp16.ll b/test/CodeGen/R600/fp32_to_fp16.ll
index 9997cd3..e86ee62 100644
--- a/test/CodeGen/R600/fp32_to_fp16.ll
+++ b/test/CodeGen/R600/fp32_to_fp16.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-declare float @llvm.convert.from.fp16(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
 
-; SI-LABEL: @test_convert_fp16_to_fp32:
-; SI: BUFFER_LOAD_USHORT [[VAL:v[0-9]+]]
-; SI: V_CVT_F32_F16_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI: BUFFER_STORE_DWORD [[RESULT]]
-define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
-  %val = load i16 addrspace(1)* %in, align 2
-  %cvt = call float @llvm.convert.from.fp16(i16 %val) nounwind readnone
-  store float %cvt, float addrspace(1)* %out, align 4
+; SI-LABEL: {{^}}test_convert_fp32_to_fp16:
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_short [[RESULT]]
+define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %val = load float addrspace(1)* %in, align 4
+  %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone
+  store i16 %cvt, i16 addrspace(1)* %out, align 2
   ret void
 }
diff --git a/test/CodeGen/R600/fp64_to_sint.ll b/test/CodeGen/R600/fp64_to_sint.ll
deleted file mode 100644
index 185e21c..0000000
--- a/test/CodeGen/R600/fp64_to_sint.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
-
-; CHECK: @fp64_to_sint
-; CHECK: V_CVT_I32_F64_e32
-define void @fp64_to_sint(i32 addrspace(1)* %out, double %in) {
-  %result = fptosi double %in to i32
-  store i32 %result, i32 addrspace(1)* %out
-  ret void
-}
diff --git a/test/CodeGen/R600/fp_to_sint.f64.ll b/test/CodeGen/R600/fp_to_sint.f64.ll
new file mode 100644
index 0000000..09edb40
--- /dev/null
+++ b/test/CodeGen/R600/fp_to_sint.f64.ll
@@ -0,0 +1,56 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: @fp_to_sint_f64_i32
+; SI: v_cvt_i32_f64_e32
+define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) {
+  %result = fptosi double %in to i32
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @fp_to_sint_v2f64_v2i32
+; SI: v_cvt_i32_f64_e32
+; SI: v_cvt_i32_f64_e32
+define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
+  %result = fptosi <2 x double> %in to <2 x i32>
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @fp_to_sint_v4f64_v4i32
+; SI: v_cvt_i32_f64_e32
+; SI: v_cvt_i32_f64_e32
+; SI: v_cvt_i32_f64_e32
+; SI: v_cvt_i32_f64_e32
+define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
+  %result = fptosi <4 x double> %in to <4 x i32>
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @fp_to_sint_i64_f64
+; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]]
+; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}}
+; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000
+
+; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}}
+; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]]
+
+; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000
+
+; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]]
+; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]]
+; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]]
+; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+  %val = load double addrspace(1)* %gep, align 8
+  %cast = fptosi double %val to i64
+  store i64 %cast, i64 addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll
index 8302b4f..c583ec3 100644
--- a/test/CodeGen/R600/fp_to_sint.ll
+++ b/test/CodeGen/R600/fp_to_sint.ll
@@ -1,31 +1,216 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
-
-; R600-CHECK: @fp_to_sint_v2i32
-; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; SI-CHECK: @fp_to_sint_v2i32
-; SI-CHECK: V_CVT_I32_F32_e32
-; SI-CHECK: V_CVT_I32_F32_e32
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+
+; FUNC-LABEL: {{^}}fp_to_sint_i32:
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI: v_cvt_i32_f32_e32
+; SI: s_endpgm
+define void @fp_to_sint_i32 (i32 addrspace(1)* %out, float %in) {
+  %conv = fptosi float %in to i32
+  store i32 %conv, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_sint_v2i32:
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI: v_cvt_i32_f32_e32
+; SI: v_cvt_i32_f32_e32
 define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
   %result = fptosi <2 x float> %in to <2 x i32>
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK: @fp_to_sint_v4i32
-; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW]}}
-; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; SI-CHECK: @fp_to_sint_v4i32
-; SI-CHECK: V_CVT_I32_F32_e32
-; SI-CHECK: V_CVT_I32_F32_e32
-; SI-CHECK: V_CVT_I32_F32_e32
-; SI-CHECK: V_CVT_I32_F32_e32
+; FUNC-LABEL: {{^}}fp_to_sint_v4i32:
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW]}}
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI: v_cvt_i32_f32_e32
+; SI: v_cvt_i32_f32_e32
+; SI: v_cvt_i32_f32_e32
+; SI: v_cvt_i32_f32_e32
 define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %value = load <4 x float> addrspace(1) * %in
   %result = fptosi <4 x float> %value to <4 x i32>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}fp_to_sint_i64:
+
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; Check that the compiler doesn't crash with a "cannot select" error
+; SI: s_endpgm
+define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) {
+entry:
+  %0 = fptosi float %in to i64
+  store i64 %0, i64 addrspace(1)* %out
+  ret void
+}
+
+; FUNC: {{^}}fp_to_sint_v2i64:
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: s_endpgm
+define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
+  %conv = fptosi <2 x float> %x to <2 x i64>
+  store <2 x i64> %conv, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC: {{^}}fp_to_sint_v4i64:
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: s_endpgm
+define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
+  %conv = fptosi <4 x float> %x to <4 x i64>
+  store <4 x i64> %conv, <4 x i64> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fp_to_sint_i64.ll b/test/CodeGen/R600/fp_to_sint_i64.ll
deleted file mode 100644
index ec3e198..0000000
--- a/test/CodeGen/R600/fp_to_sint_i64.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; FIXME: Merge into fp_to_sint.ll when EG/NI supports 64-bit types
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
-
-; SI-LABEL: @fp_to_sint_i64
-; Check that the compiler doesn't crash with a "cannot select" error
-; SI: S_ENDPGM
-define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) {
-entry:
-  %0 = fptosi float %in to i64
-  store i64 %0, i64 addrspace(1)* %out
-  ret void
-}
diff --git a/test/CodeGen/R600/fp_to_uint.f64.ll b/test/CodeGen/R600/fp_to_uint.f64.ll
index bf607ce..25859bb 100644
--- a/test/CodeGen/R600/fp_to_uint.f64.ll
+++ b/test/CodeGen/R600/fp_to_uint.f64.ll
@@ -1,9 +1,70 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 
-; SI-LABEL: @fp_to_uint_i32_f64
-; SI: V_CVT_U32_F64_e32
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; SI-LABEL: {{^}}fp_to_uint_i32_f64:
+; SI: v_cvt_u32_f64_e32
 define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) {
   %cast = fptoui double %in to i32
   store i32 %cast, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; SI-LABEL: @fp_to_uint_v2i32_v2f64
+; SI: v_cvt_u32_f64_e32
+; SI: v_cvt_u32_f64_e32
+define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
+  %cast = fptoui <2 x double> %in to <2 x i32>
+  store <2 x i32> %cast, <2 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: @fp_to_uint_v4i32_v4f64
+; SI: v_cvt_u32_f64_e32
+; SI: v_cvt_u32_f64_e32
+; SI: v_cvt_u32_f64_e32
+; SI: v_cvt_u32_f64_e32
+define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
+  %cast = fptoui <4 x double> %in to <4 x i32>
+  store <4 x i32> %cast, <4 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @fp_to_uint_i64_f64
+; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]]
+; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}}
+; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000
+
+; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}}
+; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]]
+
+; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000
+
+; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]]
+; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]]
+; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]]
+; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+  %val = load double addrspace(1)* %gep, align 8
+  %cast = fptoui double %val to i64
+  store i64 %cast, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @fp_to_uint_v2i64_v2f64
+define void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) {
+  %cast = fptoui <2 x double> %in to <2 x i64>
+  store <2 x i64> %cast, <2 x i64> addrspace(1)* %out, align 16
+  ret void
+}
+
+; SI-LABEL: @fp_to_uint_v4i64_v4f64
+define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) {
+  %cast = fptoui <4 x double> %in to <4 x i64>
+  store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32
+  ret void
+}
diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll
index 77db43b..91bf4b7 100644
--- a/test/CodeGen/R600/fp_to_uint.ll
+++ b/test/CodeGen/R600/fp_to_uint.ll
@@ -1,12 +1,21 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
-; R600-CHECK: @fp_to_uint_v2i32
-; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI-CHECK: @fp_to_uint_v2i32
-; SI-CHECK: V_CVT_U32_F32_e32
-; SI-CHECK: V_CVT_U32_F32_e32
+; FUNC-LABEL: {{^}}fp_to_uint_i32:
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI: v_cvt_u32_f32_e32
+; SI: s_endpgm
+define void @fp_to_uint_i32 (i32 addrspace(1)* %out, float %in) {
+  %conv = fptoui float %in to i32
+  store i32 %conv, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_uint_v2i32:
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI: v_cvt_u32_f32_e32
+; SI: v_cvt_u32_f32_e32
 
 define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
   %result = fptoui <2 x float> %in to <2 x i32>
@@ -14,16 +23,15 @@ define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
   ret void
 }
 
-; R600-CHECK: @fp_to_uint_v4i32
-; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; SI-CHECK: @fp_to_uint_v4i32
-; SI-CHECK: V_CVT_U32_F32_e32
-; SI-CHECK: V_CVT_U32_F32_e32
-; SI-CHECK: V_CVT_U32_F32_e32
-; SI-CHECK: V_CVT_U32_F32_e32
+; FUNC-LABEL: {{^}}fp_to_uint_v4i32:
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI: v_cvt_u32_f32_e32
+; SI: v_cvt_u32_f32_e32
+; SI: v_cvt_u32_f32_e32
+; SI: v_cvt_u32_f32_e32
 
 define void @fp_to_uint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %value = load <4 x float> addrspace(1) * %in
@@ -31,3 +39,177 @@ define void @fp_to_uint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspac
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
+
+; FUNC: {{^}}fp_to_uint_i64:
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: s_endpgm
+define void @fp_to_uint_i64(i64 addrspace(1)* %out, float %x) {
+  %conv = fptoui float %x to i64
+  store i64 %conv, i64 addrspace(1)* %out
+  ret void
+}
+
+; FUNC: {{^}}fp_to_uint_v2i64:
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: s_endpgm
+define void @fp_to_uint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
+  %conv = fptoui <2 x float> %x to <2 x i64>
+  store <2 x i64> %conv, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC: {{^}}fp_to_uint_v4i64:
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: s_endpgm
+define void @fp_to_uint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
+  %conv = fptoui <4 x float> %x to <4 x i64>
+  store <4 x i64> %conv, <4 x i64> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fpext.ll b/test/CodeGen/R600/fpext.ll
index 143ee79..418395f 100644
--- a/test/CodeGen/R600/fpext.ll
+++ b/test/CodeGen/R600/fpext.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
 
-; CHECK: @fpext
-; CHECK: V_CVT_F64_F32_e32
+; CHECK: {{^}}fpext:
+; CHECK: v_cvt_f64_f32_e32
 define void @fpext(double addrspace(1)* %out, float %in) {
   %result = fpext float %in to double
   store double %result, double addrspace(1)* %out
diff --git a/test/CodeGen/R600/fptrunc.ll b/test/CodeGen/R600/fptrunc.ll
index 20a8c00..8ac8d3b 100644
--- a/test/CodeGen/R600/fptrunc.ll
+++ b/test/CodeGen/R600/fptrunc.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
 
-; CHECK: @fptrunc
-; CHECK: V_CVT_F32_F64_e32
+; CHECK: {{^}}fptrunc:
+; CHECK: v_cvt_f32_f64_e32
 define void @fptrunc(float addrspace(1)* %out, double %in) {
   %result = fptrunc double %in to float
   store float %result, float addrspace(1)* %out
diff --git a/test/CodeGen/R600/frem.ll b/test/CodeGen/R600/frem.ll
new file mode 100644
index 0000000..c846a77
--- /dev/null
+++ b/test/CodeGen/R600/frem.ll
@@ -0,0 +1,103 @@
+; RUN: llc -march=r600 -mcpu=SI -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}frem_f32:
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:0x10
+; SI-DAG: v_cmp
+; SI-DAG: v_mul_f32
+; SI: v_rcp_f32_e32
+; SI: v_mul_f32_e32
+; SI: v_mul_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_mad_f32
+; SI: s_endpgm
+define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+                      float addrspace(1)* %in2) #0 {
+   %gep2 = getelementptr float addrspace(1)* %in2, i32 4
+   %r0 = load float addrspace(1)* %in1, align 4
+   %r1 = load float addrspace(1)* %gep2, align 4
+   %r2 = frem float %r0, %r1
+   store float %r2, float addrspace(1)* %out, align 4
+   ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_frem_f32:
+; SI: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:0x10
+; SI: buffer_load_dword [[X:v[0-9]+]], {{.*}}
+; SI: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]]
+; SI: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]]
+; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+                             float addrspace(1)* %in2) #1 {
+   %gep2 = getelementptr float addrspace(1)* %in2, i32 4
+   %r0 = load float addrspace(1)* %in1, align 4
+   %r1 = load float addrspace(1)* %gep2, align 4
+   %r2 = frem float %r0, %r1
+   store float %r2, float addrspace(1)* %out, align 4
+   ret void
+}
+
+; TODO: This should check something when f64 fdiv is implemented
+; correctly
+
+; FUNC-LABEL: {{^}}frem_f64:
+; SI: s_endpgm
+define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                      double addrspace(1)* %in2) #0 {
+   %r0 = load double addrspace(1)* %in1, align 8
+   %r1 = load double addrspace(1)* %in2, align 8
+   %r2 = frem double %r0, %r1
+   store double %r2, double addrspace(1)* %out, align 8
+   ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_frem_f64:
+; SI: v_rcp_f64_e32
+; SI: v_mul_f64
+; SI: v_bfe_u32
+; SI: v_fma_f64
+; SI: s_endpgm
+define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                             double addrspace(1)* %in2) #1 {
+   %r0 = load double addrspace(1)* %in1, align 8
+   %r1 = load double addrspace(1)* %in2, align 8
+   %r2 = frem double %r0, %r1
+   store double %r2, double addrspace(1)* %out, align 8
+   ret void
+}
+
+define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
+                        <2 x float> addrspace(1)* %in2) #0 {
+   %gep2 = getelementptr <2 x float> addrspace(1)* %in2, i32 4
+   %r0 = load <2 x float> addrspace(1)* %in1, align 8
+   %r1 = load <2 x float> addrspace(1)* %gep2, align 8
+   %r2 = frem <2 x float> %r0, %r1
+   store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8
+   ret void
+}
+
+define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
+                        <4 x float> addrspace(1)* %in2) #0 {
+   %gep2 = getelementptr <4 x float> addrspace(1)* %in2, i32 4
+   %r0 = load <4 x float> addrspace(1)* %in1, align 16
+   %r1 = load <4 x float> addrspace(1)* %gep2, align 16
+   %r2 = frem <4 x float> %r0, %r1
+   store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16
+   ret void
+}
+
+define void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+                        <2 x double> addrspace(1)* %in2) #0 {
+   %gep2 = getelementptr <2 x double> addrspace(1)* %in2, i32 4
+   %r0 = load <2 x double> addrspace(1)* %in1, align 16
+   %r1 = load <2 x double> addrspace(1)* %gep2, align 16
+   %r2 = frem <2 x double> %r0, %r1
+   store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16
+   ret void
+}
+
+attributes #0 = { nounwind "unsafe-fp-math"="false" }
+attributes #1 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/R600/fsqrt.ll b/test/CodeGen/R600/fsqrt.ll
index ae50b17..1f91faf 100644
--- a/test/CodeGen/R600/fsqrt.ll
+++ b/test/CodeGen/R600/fsqrt.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
-; CHECK: @fsqrt_f32
-; CHECK: V_SQRT_F32_e32 {{v[0-9]+, v[0-9]+}}
+; CHECK: {{^}}fsqrt_f32:
+; CHECK: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
 
 define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
    %r0 = load float addrspace(1)* %in
@@ -10,8 +10,8 @@ define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
    ret void
 }
 
-; CHECK: @fsqrt_f64
-; CHECK: V_SQRT_F64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; CHECK: {{^}}fsqrt_f64:
+; CHECK: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 
 define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
    %r0 = load double addrspace(1)* %in
diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll
index 4f74efb..6e5ccf1 100644
--- a/test/CodeGen/R600/fsub.ll
+++ b/test/CodeGen/R600/fsub.ll
@@ -1,14 +1,25 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
-
-; R600-CHECK: @fsub_f32
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W
-; SI-CHECK: @fsub_f32
-; SI-CHECK: V_SUB_F32
-define void @fsub_f32(float addrspace(1)* %out, float %a, float %b) {
-entry:
-  %0 = fsub float %a, %b
-  store float %0, float addrspace(1)* %out
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}v_fsub_f32:
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %b_ptr = getelementptr float addrspace(1)* %in, i32 1
+  %a = load float addrspace(1)* %in, align 4
+  %b = load float addrspace(1)* %b_ptr, align 4
+  %result = fsub float %a, %b
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_fsub_f32:
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W
+
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) {
+  %sub = fsub float %a, %b
+  store float %sub, float addrspace(1)* %out, align 4
   ret void
 }
 
@@ -16,34 +27,48 @@ declare float @llvm.R600.load.input(i32) readnone
 
 declare void @llvm.AMDGPU.store.output(float, i32)
 
-; R600-CHECK: @fsub_v2f32
-; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
-; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
-; SI-CHECK: @fsub_v2f32
-; SI-CHECK: V_SUB_F32
-; SI-CHECK: V_SUB_F32
+; FUNC-LABEL: {{^}}fsub_v2f32:
+; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
+; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
+
+; FIXME: Should be using SGPR directly for first operand
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
-entry:
-  %0 = fsub <2 x float> %a, %b
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  %sub = fsub <2 x float> %a, %b
+  store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8
   ret void
 }
 
-; R600-CHECK: @fsub_v4f32
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-; SI-CHECK: @fsub_v4f32
-; SI-CHECK: V_SUB_F32
-; SI-CHECK: V_SUB_F32
-; SI-CHECK: V_SUB_F32
-; SI-CHECK: V_SUB_F32
-define void @fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+; FUNC-LABEL: {{^}}v_fsub_v4f32:
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
-  %a = load <4 x float> addrspace(1) * %in
-  %b = load <4 x float> addrspace(1) * %b_ptr
+  %a = load <4 x float> addrspace(1)* %in, align 16
+  %b = load <4 x float> addrspace(1)* %b_ptr, align 16
+  %result = fsub <4 x float> %a, %b
+  store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FIXME: Should be using SGPR directly for first operand
+
+; FUNC-LABEL: {{^}}s_fsub_v4f32:
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: s_endpgm
+define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {
   %result = fsub <4 x float> %a, %b
-  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
diff --git a/test/CodeGen/R600/fsub64.ll b/test/CodeGen/R600/fsub64.ll
index f5e5708..eca1b62 100644
--- a/test/CodeGen/R600/fsub64.ll
+++ b/test/CodeGen/R600/fsub64.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=r600 -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @fsub_f64:
-; SI: V_ADD_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI-LABEL: {{^}}fsub_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
diff --git a/test/CodeGen/R600/ftrunc.f64.ll b/test/CodeGen/R600/ftrunc.f64.ll
new file mode 100644
index 0000000..fba6154
--- /dev/null
+++ b/test/CodeGen/R600/ftrunc.f64.ll
@@ -0,0 +1,110 @@
+; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.trunc.f64(double) nounwind readnone
+declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone
+declare <3 x double> @llvm.trunc.v3f64(<3 x double>) nounwind readnone
+declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone
+declare <8 x double> @llvm.trunc.v8f64(<8 x double>) nounwind readnone
+declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone
+
+; FUNC-LABEL: {{^}}v_ftrunc_f64:
+; CI: v_trunc_f64
+; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11
+; SI: s_endpgm
+define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+  %x = load double addrspace(1)* %in, align 8
+  %y = call double @llvm.trunc.f64(double %x) nounwind readnone
+  store double %y, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ftrunc_f64:
+; CI: v_trunc_f64_e32
+
+; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
+; SI: s_lshr_b64
+; SI: s_not_b64
+; SI: s_and_b64
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI: cmp_lt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: cmp_gt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: s_endpgm
+define void @ftrunc_f64(double addrspace(1)* %out, double %x) {
+  %y = call double @llvm.trunc.f64(double %x) nounwind readnone
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ftrunc_v2f64:
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+  %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone
+  store <2 x double> %y, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f64:
+; FIXME-CI: v_trunc_f64_e32
+; FIXME-CI: v_trunc_f64_e32
+; FIXME-CI: v_trunc_f64_e32
+; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+;   %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone
+;   store <3 x double> %y, <3 x double> addrspace(1)* %out
+;   ret void
+; }
+
+; FUNC-LABEL: {{^}}ftrunc_v4f64:
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+  %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone
+  store <4 x double> %y, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ftrunc_v8f64:
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+  %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone
+  store <8 x double> %y, <8 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ftrunc_v16f64:
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+  %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone
+  store <16 x double> %y, <16 x double> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/ftrunc.ll b/test/CodeGen/R600/ftrunc.ll
index 0d7d467..0eb1d7d 100644
--- a/test/CodeGen/R600/ftrunc.ll
+++ b/test/CodeGen/R600/ftrunc.ll
@@ -8,55 +8,55 @@ declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone
 declare <8 x float> @llvm.trunc.v8f32(<8 x float>) nounwind readnone
 declare <16 x float> @llvm.trunc.v16f32(<16 x float>) nounwind readnone
 
-; FUNC-LABEL: @ftrunc_f32:
+; FUNC-LABEL: {{^}}ftrunc_f32:
 ; EG: TRUNC
-; SI: V_TRUNC_F32_e32
+; SI: v_trunc_f32_e32
 define void @ftrunc_f32(float addrspace(1)* %out, float %x) {
   %y = call float @llvm.trunc.f32(float %x) nounwind readnone
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @ftrunc_v2f32:
+; FUNC-LABEL: {{^}}ftrunc_v2f32:
 ; EG: TRUNC
 ; EG: TRUNC
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
 define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
   %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone
   store <2 x float> %y, <2 x float> addrspace(1)* %out
   ret void
 }
 
-; FIXME-FUNC-LABEL: @ftrunc_v3f32:
+; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f32:
 ; FIXME-EG: TRUNC
 ; FIXME-EG: TRUNC
 ; FIXME-EG: TRUNC
-; FIXME-SI: V_TRUNC_F32_e32
-; FIXME-SI: V_TRUNC_F32_e32
-; FIXME-SI: V_TRUNC_F32_e32
+; FIXME-SI: v_trunc_f32_e32
+; FIXME-SI: v_trunc_f32_e32
+; FIXME-SI: v_trunc_f32_e32
 ; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
 ;   %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone
 ;   store <3 x float> %y, <3 x float> addrspace(1)* %out
 ;   ret void
 ; }
 
-; FUNC-LABEL: @ftrunc_v4f32:
+; FUNC-LABEL: {{^}}ftrunc_v4f32:
 ; EG: TRUNC
 ; EG: TRUNC
 ; EG: TRUNC
 ; EG: TRUNC
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
 define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
   %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone
   store <4 x float> %y, <4 x float> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @ftrunc_v8f32:
+; FUNC-LABEL: {{^}}ftrunc_v8f32:
 ; EG: TRUNC
 ; EG: TRUNC
 ; EG: TRUNC
@@ -65,21 +65,21 @@ define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
 ; EG: TRUNC
 ; EG: TRUNC
 ; EG: TRUNC
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
 define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
   %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone
   store <8 x float> %y, <8 x float> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @ftrunc_v16f32:
+; FUNC-LABEL: {{^}}ftrunc_v16f32:
 ; EG: TRUNC
 ; EG: TRUNC
 ; EG: TRUNC
@@ -96,22 +96,22 @@ define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
 ; EG: TRUNC
 ; EG: TRUNC
 ; EG: TRUNC
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
 define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
   %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone
   store <16 x float> %y, <16 x float> addrspace(1)* %out
diff --git a/test/CodeGen/R600/gep-address-space.ll b/test/CodeGen/R600/gep-address-space.ll
index ab2c0bf..036daaf 100644
--- a/test/CodeGen/R600/gep-address-space.ll
+++ b/test/CodeGen/R600/gep-address-space.ll
@@ -1,29 +1,33 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=CHECK %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
 
 define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
-; CHECK-LABEL: @use_gep_address_space:
-; CHECK: V_MOV_B32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}}
-; CHECK: DS_WRITE_B32 [[PTR]], v{{[0-9]+}}, 0x40
+; CHECK-LABEL: {{^}}use_gep_address_space:
+; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}}
+; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64
   %p = getelementptr [1024 x i32] addrspace(3)* %array, i16 0, i16 16
   store i32 99, i32 addrspace(3)* %p
   ret void
 }
 
 define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
-; CHECK-LABEL: @use_gep_address_space_large_offset:
-; CHECK: S_ADD_I32
-; CHECK: DS_WRITE_B32
+; CHECK-LABEL: {{^}}use_gep_address_space_large_offset:
+; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
+; SI, which is why it is being OR'd with the base pointer.
+; SI: s_or_b32
+; CI: s_add_i32
+; CHECK: ds_write_b32
   %p = getelementptr [1024 x i32] addrspace(3)* %array, i16 0, i16 16384
   store i32 99, i32 addrspace(3)* %p
   ret void
 }
 
 define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
-; CHECK-LABEL: @gep_as_vector_v4:
-; CHECK: S_ADD_I32
-; CHECK: S_ADD_I32
-; CHECK: S_ADD_I32
-; CHECK: S_ADD_I32
+; CHECK-LABEL: {{^}}gep_as_vector_v4:
+; CHECK: s_add_i32
+; CHECK: s_add_i32
+; CHECK: s_add_i32
+; CHECK: s_add_i32
   %p = getelementptr <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
   %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0
   %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1
@@ -37,9 +41,9 @@ define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind
 }
 
 define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
-; CHECK-LABEL: @gep_as_vector_v2:
-; CHECK: S_ADD_I32
-; CHECK: S_ADD_I32
+; CHECK-LABEL: {{^}}gep_as_vector_v2:
+; CHECK: s_add_i32
+; CHECK: s_add_i32
   %p = getelementptr <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16>
   %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0
   %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1
diff --git a/test/CodeGen/R600/global-directive.ll b/test/CodeGen/R600/global-directive.ll
new file mode 100644
index 0000000..d1244b8
--- /dev/null
+++ b/test/CodeGen/R600/global-directive.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; Make sure the GlobalDirective isn't merged with the function name
+
+; SI:	.globl	foo
+; SI: {{^}}foo:
+define void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %a = load i32 addrspace(1)* %in
+  %b = load i32 addrspace(1)* %b_ptr
+  %result = add i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/global-zero-initializer.ll b/test/CodeGen/R600/global-zero-initializer.ll
new file mode 100644
index 0000000..b69b061
--- /dev/null
+++ b/test/CodeGen/R600/global-zero-initializer.ll
@@ -0,0 +1,12 @@
+; RUN: not llc -march=r600 -mcpu=SI < %s 2>&1 | FileCheck %s
+
+; CHECK: error: unsupported initializer for address space in load_init_global_global
+
+@lds = addrspace(1) global [256 x i32] zeroinitializer
+
+define void @load_init_global_global(i32 addrspace(1)* %out, i1 %p) {
+ %gep = getelementptr [256 x i32] addrspace(1)* @lds, i32 0, i32 10
+  %ld = load i32 addrspace(1)* %gep
+  store i32 %ld, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/global_atomics.ll b/test/CodeGen/R600/global_atomics.ll
new file mode 100644
index 0000000..533a964
--- /dev/null
+++ b/test/CodeGen/R600/global_atomics.ll
@@ -0,0 +1,801 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}atomic_add_i32_offset:
+; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset:
+; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset:
+; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64_offset:
+; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32:
+; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_ret:
+; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_addr64:
+; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64:
+; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_offset:
+; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset:
+; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset:
+; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64_offset:
+; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32:
+; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_ret:
+; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_addr64:
+; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64:
+; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_offset:
+; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset:
+; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset:
+; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset:
+; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32:
+; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_ret:
+; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_addr64:
+; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64:
+; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_offset:
+; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset:
+; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset:
+; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64_offset:
+; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32:
+; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_ret:
+; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_addr64:
+; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64:
+; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_offset:
+; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset:
+; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset:
+; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset:
+; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32:
+; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_ret:
+; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_addr64:
+; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64:
+; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_offset:
+; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset:
+; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset:
+; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64_offset:
+; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32:
+; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_ret:
+; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_addr64:
+; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64:
+; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_offset:
+; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset:
+; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset:
+; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset:
+; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32:
+; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_ret:
+; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_addr64:
+; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64:
+; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_offset:
+; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset:
+; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset:
+; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64_offset:
+; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32:
+; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_ret:
+; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_addr64:
+; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64:
+; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_offset:
+; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset:
+; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
+; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset:
+; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32:
+; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_ret:
+; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64:
+; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64:
+; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
+; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset:
+; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset:
+; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset:
+; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32:
+; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_ret:
+; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_addr64:
+; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64:
+; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
diff --git a/test/CodeGen/R600/gv-const-addrspace-fail.ll b/test/CodeGen/R600/gv-const-addrspace-fail.ll
index ebd7811..905948f 100644
--- a/test/CodeGen/R600/gv-const-addrspace-fail.ll
+++ b/test/CodeGen/R600/gv-const-addrspace-fail.ll
@@ -1,14 +1,13 @@
-; XFAIL: *
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
 @a = internal addrspace(2) constant [1 x i8] [ i8 7 ], align 1
 
-; FUNC-LABEL: @test_i8
+; FUNC-LABEL: {{^}}test_i8:
 ; EG: CF_END
-; SI: BUFFER_STORE_BYTE
-; SI: S_ENDPGM
+; SI: buffer_store_byte
+; SI: s_endpgm
 define void @test_i8( i32 %s, i8 addrspace(1)* %out) #3 {
   %arrayidx = getelementptr inbounds [1 x i8] addrspace(2)* @a, i32 0, i32 %s
   %1 = load i8 addrspace(2)* %arrayidx, align 1
@@ -18,10 +17,10 @@ define void @test_i8( i32 %s, i8 addrspace(1)* %out) #3 {
 
 @b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
 
-; FUNC-LABEL: @test_i16
+; FUNC-LABEL: {{^}}test_i16:
 ; EG: CF_END
-; SI: BUFFER_STORE_SHORT
-; SI: S_ENDPGM
+; SI: buffer_store_short
+; SI: s_endpgm
 define void @test_i16( i32 %s, i16 addrspace(1)* %out) #3 {
   %arrayidx = getelementptr inbounds [1 x i16] addrspace(2)* @b, i32 0, i32 %s
   %1 = load i16 addrspace(2)* %arrayidx, align 2
@@ -32,9 +31,9 @@ define void @test_i16( i32 %s, i16 addrspace(1)* %out) #3 {
 %struct.bar = type { float, [5 x i8] }
 
 ; The illegal i8s aren't handled
-@struct_bar_gv = internal addrspace(2) unnamed_addr constant [1 x %struct.bar] [ %struct.bar { float 16.0, [5 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4] } ]
+@struct_bar_gv = internal addrspace(2) constant [1 x %struct.bar] [ %struct.bar { float 16.0, [5 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4] } ]
 
-; FUNC-LABEL: @struct_bar_gv_load
+; FUNC-LABEL: {{^}}struct_bar_gv_load:
 define void @struct_bar_gv_load(i8 addrspace(1)* %out, i32 %index) {
   %gep = getelementptr inbounds [1 x %struct.bar] addrspace(2)* @struct_bar_gv, i32 0, i32 0, i32 1, i32 %index
   %load = load i8 addrspace(2)* %gep, align 1
@@ -49,7 +48,7 @@ define void @struct_bar_gv_load(i8 addrspace(1)* %out, i32 %index) {
                                                                     <4 x i32> <i32 9, i32 10, i32 11, i32 12>,
                                                                     <4 x i32> <i32 13, i32 14, i32 15, i32 16> ]
 
-; FUNC-LABEL: @array_vector_gv_load
+; FUNC-LABEL: {{^}}array_vector_gv_load:
 define void @array_vector_gv_load(<4 x i32> addrspace(1)* %out, i32 %index) {
   %gep = getelementptr inbounds [4 x <4 x i32>] addrspace(2)* @array_vector_gv, i32 0, i32 %index
   %load = load <4 x i32> addrspace(2)* %gep, align 16
diff --git a/test/CodeGen/R600/gv-const-addrspace.ll b/test/CodeGen/R600/gv-const-addrspace.ll
index db64a6f..6aa20b8 100644
--- a/test/CodeGen/R600/gv-const-addrspace.ll
+++ b/test/CodeGen/R600/gv-const-addrspace.ll
@@ -4,11 +4,11 @@
 
 @b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
 
-; XXX: Test on SI once 64-bit adds are supportes.
-
 @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4
 
-; FUNC-LABEL: @float
+; FUNC-LABEL: {{^}}float:
+; FIXME: We should be using s_load_dword here.
+; SI: buffer_load_dword
 
 ; EG-DAG: MOV {{\** *}}T2.X
 ; EG-DAG: MOV {{\** *}}T3.X
@@ -27,7 +27,10 @@ entry:
 
 @i32_gv = internal unnamed_addr addrspace(2) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4
 
-; FUNC-LABEL: @i32
+; FUNC-LABEL: {{^}}i32:
+
+; FIXME: We should be using s_load_dword here.
+; SI: buffer_load_dword
 
 ; EG-DAG: MOV {{\** *}}T2.X
 ; EG-DAG: MOV {{\** *}}T3.X
@@ -49,7 +52,8 @@ entry:
 
 @struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ]
 
-; FUNC-LABEL: @struct_foo_gv_load
+; FUNC-LABEL: {{^}}struct_foo_gv_load:
+; SI: s_load_dword
 
 define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
   %gep = getelementptr inbounds [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
@@ -63,10 +67,31 @@ define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
                                                                 <1 x i32> <i32 3>,
                                                                 <1 x i32> <i32 4> ]
 
-; FUNC-LABEL: @array_v1_gv_load
+; FUNC-LABEL: {{^}}array_v1_gv_load:
+; FIXME: We should be using s_load_dword here.
+; SI: buffer_load_dword
 define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
   %gep = getelementptr inbounds [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
   %load = load <1 x i32> addrspace(2)* %gep, align 4
   store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4
   ret void
 }
+
+define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) {
+entry:
+  %0 = icmp eq i32 0, %a
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = getelementptr inbounds [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
+  %2 = load float addrspace(2)* %1
+  store float %2, float addrspace(1)* %out
+  br label %endif
+
+else:
+  store float 1.0, float addrspace(1)* %out
+  br label %endif
+
+endif:
+  ret void
+}
diff --git a/test/CodeGen/R600/half.ll b/test/CodeGen/R600/half.ll
new file mode 100644
index 0000000..6ad9b2f
--- /dev/null
+++ b/test/CodeGen/R600/half.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
+
+define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) {
+; CHECK-LABEL: {{^}}test_load_store:
+; CHECK: buffer_load_ushort [[TMP:v[0-9]+]]
+; CHECK: buffer_store_short [[TMP]]
+  %val = load half addrspace(1)* %in
+  store half %val, half addrspace(1) * %out
+  ret void
+}
+
+define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) {
+; CHECK-LABEL: {{^}}test_bitcast_from_half:
+; CHECK: buffer_load_ushort [[TMP:v[0-9]+]]
+; CHECK: buffer_store_short [[TMP]]
+  %val = load half addrspace(1) * %in
+  %val_int = bitcast half %val to i16
+  store i16 %val_int, i16 addrspace(1)* %out
+  ret void
+}
+
+define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) {
+; CHECK-LABEL: {{^}}test_bitcast_to_half:
+; CHECK: buffer_load_ushort [[TMP:v[0-9]+]]
+; CHECK: buffer_store_short [[TMP]]
+  %val = load i16 addrspace(1)* %in
+  %val_fp = bitcast i16 %val to half
+  store half %val_fp, half addrspace(1)* %out
+  ret void
+}
+
+define void @test_extend32(half addrspace(1)* %in, float addrspace(1)* %out) {
+; CHECK-LABEL: {{^}}test_extend32:
+; CHECK: v_cvt_f32_f16_e32
+
+  %val16 = load half addrspace(1)* %in
+  %val32 = fpext half %val16 to float
+  store float %val32, float addrspace(1)* %out
+  ret void
+}
+
+define void @test_extend64(half addrspace(1)* %in, double addrspace(1)* %out) {
+; CHECK-LABEL: {{^}}test_extend64:
+; CHECK: v_cvt_f32_f16_e32
+; CHECK: v_cvt_f64_f32_e32
+
+  %val16 = load half addrspace(1)* %in
+  %val64 = fpext half %val16 to double
+  store double %val64, double addrspace(1)* %out
+  ret void
+}
+
+define void @test_trunc32(float addrspace(1)* %in, half addrspace(1)* %out) {
+; CHECK-LABEL: {{^}}test_trunc32:
+; CHECK: v_cvt_f16_f32_e32
+
+  %val32 = load float addrspace(1)* %in
+  %val16 = fptrunc float %val32 to half
+  store half %val16, half addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/i1-copy-implicit-def.ll b/test/CodeGen/R600/i1-copy-implicit-def.ll
new file mode 100644
index 0000000..7c5bc04
--- /dev/null
+++ b/test/CodeGen/R600/i1-copy-implicit-def.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SILowerI1Copies was not handling IMPLICIT_DEF
+; SI-LABEL: {{^}}br_implicit_def:
+; SI: BB#0:
+; SI-NEXT: s_and_saveexec_b64
+; SI-NEXT: s_xor_b64
+; SI-NEXT: BB#1:
+define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 {
+bb:
+  br i1 undef, label %bb1, label %bb2
+
+bb1:
+  store volatile i32 123, i32 addrspace(1)* %out
+  ret void
+
+bb2:
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/R600/i1-copy-phi.ll b/test/CodeGen/R600/i1-copy-phi.ll
new file mode 100644
index 0000000..bfa8672
--- /dev/null
+++ b/test/CodeGen/R600/i1-copy-phi.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}br_i1_phi:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; SI: s_and_saveexec_b64
+; SI: s_xor_b64
+; SI: v_mov_b32_e32 [[REG]], -1{{$}}
+; SI: v_cmp_ne_i32_e64 {{s\[[0-9]+:[0-9]+\]}}, [[REG]], 0
+; SI: s_and_saveexec_b64
+; SI: s_xor_b64
+; SI: s_endpgm
+define void @br_i1_phi(i32 %arg, i1 %arg1) #0 {
+bb:
+  br i1 %arg1, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb
+  %tmp = phi i1 [ true, %bb2 ], [ false, %bb ]
+  br i1 %tmp, label %bb4, label %bb6
+
+bb4:                                              ; preds = %bb3
+  %tmp5 = mul i32 undef, %arg
+  br label %bb6
+
+bb6:                                              ; preds = %bb4, %bb3
+  ret void
+}
diff --git a/test/CodeGen/R600/icmp64.ll b/test/CodeGen/R600/icmp64.ll
index c9e62ff..870bf7f 100644
--- a/test/CodeGen/R600/icmp64.ll
+++ b/test/CodeGen/R600/icmp64.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @test_i64_eq:
-; SI: V_CMP_EQ_I64
+; SI-LABEL: {{^}}test_i64_eq:
+; SI: v_cmp_eq_i64
 define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp eq i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -9,8 +9,8 @@ define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   ret void
 }
 
-; SI-LABEL: @test_i64_ne:
-; SI: V_CMP_NE_I64
+; SI-LABEL: {{^}}test_i64_ne:
+; SI: v_cmp_ne_i64
 define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ne i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -18,8 +18,8 @@ define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   ret void
 }
 
-; SI-LABEL: @test_i64_slt:
-; SI: V_CMP_LT_I64
+; SI-LABEL: {{^}}test_i64_slt:
+; SI: v_cmp_lt_i64
 define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp slt i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -27,8 +27,8 @@ define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   ret void
 }
 
-; SI-LABEL: @test_i64_ult:
-; SI: V_CMP_LT_U64
+; SI-LABEL: {{^}}test_i64_ult:
+; SI: v_cmp_lt_u64
 define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ult i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -36,8 +36,8 @@ define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   ret void
 }
 
-; SI-LABEL: @test_i64_sle:
-; SI: V_CMP_LE_I64
+; SI-LABEL: {{^}}test_i64_sle:
+; SI: v_cmp_le_i64
 define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sle i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -45,8 +45,8 @@ define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   ret void
 }
 
-; SI-LABEL: @test_i64_ule:
-; SI: V_CMP_LE_U64
+; SI-LABEL: {{^}}test_i64_ule:
+; SI: v_cmp_le_u64
 define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ule i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -54,8 +54,8 @@ define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   ret void
 }
 
-; SI-LABEL: @test_i64_sgt:
-; SI: V_CMP_GT_I64
+; SI-LABEL: {{^}}test_i64_sgt:
+; SI: v_cmp_gt_i64
 define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sgt i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -63,8 +63,8 @@ define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   ret void
 }
 
-; SI-LABEL: @test_i64_ugt:
-; SI: V_CMP_GT_U64
+; SI-LABEL: {{^}}test_i64_ugt:
+; SI: v_cmp_gt_u64
 define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ugt i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -72,8 +72,8 @@ define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   ret void
 }
 
-; SI-LABEL: @test_i64_sge:
-; SI: V_CMP_GE_I64
+; SI-LABEL: {{^}}test_i64_sge:
+; SI: v_cmp_ge_i64
 define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sge i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -81,8 +81,8 @@ define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   ret void
 }
 
-; SI-LABEL: @test_i64_uge:
-; SI: V_CMP_GE_U64
+; SI-LABEL: {{^}}test_i64_uge:
+; SI: v_cmp_ge_u64
 define void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp uge i64 %a, %b
   %result = sext i1 %cmp to i32
diff --git a/test/CodeGen/R600/imm.ll b/test/CodeGen/R600/imm.ll
index b047315..1fcaf29 100644
--- a/test/CodeGen/R600/imm.ll
+++ b/test/CodeGen/R600/imm.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
 
 ; Use a 64-bit value with lo bits that can be represented as an inline constant
-; CHECK: @i64_imm_inline_lo
-; CHECK: S_MOV_B32 [[LO:s[0-9]+]], 5
-; CHECK: V_MOV_B32_e32 v[[LO_VGPR:[0-9]+]], [[LO]]
-; CHECK: BUFFER_STORE_DWORDX2 v{{\[}}[[LO_VGPR]]:
+; CHECK-LABEL: {{^}}i64_imm_inline_lo:
+; CHECK: s_mov_b32 [[LO:s[0-9]+]], 5
+; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], [[LO]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]:
 define void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
 entry:
   store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005
@@ -12,12 +12,204 @@ entry:
 }
 
 ; Use a 64-bit value with hi bits that can be represented as an inline constant
-; CHECK: @i64_imm_inline_hi
-; CHECK: S_MOV_B32 [[HI:s[0-9]+]], 5
-; CHECK: V_MOV_B32_e32 v[[HI_VGPR:[0-9]+]], [[HI]]
-; CHECK: BUFFER_STORE_DWORDX2 v{{\[[0-9]+:}}[[HI_VGPR]]
+; CHECK-LABEL: {{^}}i64_imm_inline_hi:
+; CHECK: s_mov_b32 [[HI:s[0-9]+]], 5
+; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], [[HI]]
+; CHECK: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]]
 define void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
 entry:
   store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678
   ret void
 }
+
+; CHECK-LABEL: {{^}}store_inline_imm_0.0_f32
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
+  store float 0.0, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_0.5_f32
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) {
+  store float 0.5, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f32
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) {
+  store float -0.5, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_1.0_f32
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) {
+  store float 1.0, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f32
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) {
+  store float -1.0, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_2.0_f32
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) {
+  store float 2.0, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f32
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) {
+  store float -2.0, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_4.0_f32
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) {
+  store float 4.0, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f32
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
+  store float -4.0, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_literal_imm_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_literal_imm_f32(float addrspace(1)* %out) {
+  store float 4096.0, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_0.0_f32
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0.0, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 0.0
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_0.5_f32
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 0.5
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f32
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, -0.5
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_1.0_f32
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 1.0
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f32
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, -1.0
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_2.0_f32
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 2.0
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f32
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, -2.0
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_4.0_f32
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 4.0
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f32
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, -4.0
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @commute_add_inline_imm_0.5_f32
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %x = load float addrspace(1)* %in
+  %y = fadd float %x, 0.5
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @commute_add_literal_f32
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]]
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %x = load float addrspace(1)* %in
+  %y = fadd float %x, 1024.0
+  store float %y, float addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/indirect-addressing-si.ll b/test/CodeGen/R600/indirect-addressing-si.ll
index 169d69b..0ba1614 100644
--- a/test/CodeGen/R600/indirect-addressing-si.ll
+++ b/test/CodeGen/R600/indirect-addressing-si.ll
@@ -4,8 +4,8 @@
 ; indexing of vectors.
 
 ; CHECK: extract_w_offset
-; CHECK: S_MOV_B32 m0
-; CHECK-NEXT: V_MOVRELS_B32_e32
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movrels_b32_e32
 define void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
 entry:
   %0 = add i32 %in, 1
@@ -15,8 +15,8 @@ entry:
 }
 
 ; CHECK: extract_wo_offset
-; CHECK: S_MOV_B32 m0
-; CHECK-NEXT: V_MOVRELS_B32_e32
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movrels_b32_e32
 define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
 entry:
   %0 = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %in
@@ -25,8 +25,8 @@ entry:
 }
 
 ; CHECK: insert_w_offset
-; CHECK: S_MOV_B32 m0
-; CHECK-NEXT: V_MOVRELD_B32_e32
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movreld_b32_e32
 define void @insert_w_offset(float addrspace(1)* %out, i32 %in) {
 entry:
   %0 = add i32 %in, 1
@@ -37,8 +37,8 @@ entry:
 }
 
 ; CHECK: insert_wo_offset
-; CHECK: S_MOV_B32 m0
-; CHECK-NEXT: V_MOVRELD_B32_e32
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movreld_b32_e32
 define void @insert_wo_offset(float addrspace(1)* %out, i32 %in) {
 entry:
   %0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
diff --git a/test/CodeGen/R600/indirect-private-64.ll b/test/CodeGen/R600/indirect-private-64.ll
index b127b7e..e0a6ce1 100644
--- a/test/CodeGen/R600/indirect-private-64.ll
+++ b/test/CodeGen/R600/indirect-private-64.ll
@@ -1,10 +1,16 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+
 
 declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
 
-; SI-LABEL: @private_access_f64_alloca:
-; SI: DS_WRITE_B64
-; SI: DS_READ_B64
+; SI-LABEL: {{^}}private_access_f64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx2
+; SI-ALLOCA: buffer_load_dwordx2
+
+; SI-PROMOTE: ds_write_b64
+; SI-PROMOTE: ds_read_b64
 define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
   %val = load double addrspace(1)* %in, align 8
   %array = alloca double, i32 16, align 8
@@ -16,11 +22,19 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
   ret void
 }
 
-; SI-LABEL: @private_access_v2f64_alloca:
-; SI: DS_WRITE_B64
-; SI: DS_WRITE_B64
-; SI: DS_READ_B64
-; SI: DS_READ_B64
+; SI-LABEL: {{^}}private_access_v2f64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx4
+; SI-ALLOCA: buffer_load_dwordx4
+
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
   %val = load <2 x double> addrspace(1)* %in, align 16
   %array = alloca <2 x double>, i32 16, align 16
@@ -32,9 +46,13 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out
   ret void
 }
 
-; SI-LABEL: @private_access_i64_alloca:
-; SI: DS_WRITE_B64
-; SI: DS_READ_B64
+; SI-LABEL: {{^}}private_access_i64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx2
+; SI-ALLOCA: buffer_load_dwordx2
+
+; SI-PROMOTE: ds_write_b64
+; SI-PROMOTE: ds_read_b64
 define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
   %val = load i64 addrspace(1)* %in, align 8
   %array = alloca i64, i32 16, align 8
@@ -46,11 +64,19 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
   ret void
 }
 
-; SI-LABEL: @private_access_v2i64_alloca:
-; SI: DS_WRITE_B64
-; SI: DS_WRITE_B64
-; SI: DS_READ_B64
-; SI: DS_READ_B64
+; SI-LABEL: {{^}}private_access_v2i64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx4
+; SI-ALLOCA: buffer_load_dwordx4
+
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
   %val = load <2 x i64> addrspace(1)* %in, align 16
   %array = alloca <2 x i64>, i32 16, align 16
diff --git a/test/CodeGen/R600/infinite-loop.ll b/test/CodeGen/R600/infinite-loop.ll
index 68ffaae..48edab0 100644
--- a/test/CodeGen/R600/infinite-loop.ll
+++ b/test/CodeGen/R600/infinite-loop.ll
@@ -1,11 +1,11 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @infinite_loop:
-; SI: V_MOV_B32_e32 [[REG:v[0-9]+]], 0x3e7
+; SI-LABEL: {{^}}infinite_loop:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
 ; SI: BB0_1:
-; SI: BUFFER_STORE_DWORD [[REG]]
-; SI: S_WAITCNT vmcnt(0) expcnt(0)
-; SI: S_BRANCH BB0_1
+; SI: buffer_store_dword [[REG]]
+; SI: s_waitcnt vmcnt(0) expcnt(0)
+; SI: s_branch BB0_1
 define void @infinite_loop(i32 addrspace(1)* %out) {
 entry:
   br label %for.body
diff --git a/test/CodeGen/R600/inline-calls.ll b/test/CodeGen/R600/inline-calls.ll
new file mode 100644
index 0000000..3bceeca
--- /dev/null
+++ b/test/CodeGen/R600/inline-calls.ll
@@ -0,0 +1,24 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck  %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-NOT: {{^}}func:
+define internal fastcc i32 @func(i32 %a) {
+entry:
+  %tmp0 = add i32 %a, 1
+  ret i32 %tmp0
+}
+
+; CHECK: {{^}}kernel:
+define void @kernel(i32 addrspace(1)* %out) {
+entry:
+  %tmp0 = call i32 @func(i32 1)
+  store i32 %tmp0, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: {{^}}kernel2:
+define void @kernel2(i32 addrspace(1)* %out) {
+entry:
+  call void @kernel(i32 addrspace(1)* %out)
+  ret void
+}
diff --git a/test/CodeGen/R600/input-mods.ll b/test/CodeGen/R600/input-mods.ll
index 13bfbab..e3e9499 100644
--- a/test/CodeGen/R600/input-mods.ll
+++ b/test/CodeGen/R600/input-mods.ll
@@ -1,9 +1,9 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
 ;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
 
-;EG-CHECK-LABEL: @test
+;EG-CHECK-LABEL: {{^}}test:
 ;EG-CHECK: EXP_IEEE *
-;CM-CHECK-LABEL: @test
+;CM-CHECK-LABEL: {{^}}test:
 ;CM-CHECK: EXP_IEEE T{{[0-9]+}}.X, -|T{{[0-9]+}}.X|
 ;CM-CHECK: EXP_IEEE T{{[0-9]+}}.Y (MASKED), -|T{{[0-9]+}}.X|
 ;CM-CHECK: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X|
diff --git a/test/CodeGen/R600/insert_subreg.ll b/test/CodeGen/R600/insert_subreg.ll
new file mode 100644
index 0000000..e311e19
--- /dev/null
+++ b/test/CodeGen/R600/insert_subreg.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=r600 -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s
+
+; Test that INSERT_SUBREG instructions don't have non-register operands after
+; instruction selection.
+
+; Make sure this doesn't crash
+; CHECK-LABEL: test:
+define void @test(i64 addrspace(1)* %out) {
+entry:
+  %tmp0 = alloca [16 x i32]
+  %tmp1 = ptrtoint [16 x i32]* %tmp0 to i32
+  %tmp2 = sext i32 %tmp1 to i64
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/insert_vector_elt.ll b/test/CodeGen/R600/insert_vector_elt.ll
index 43b4efc..857c414 100644
--- a/test/CodeGen/R600/insert_vector_elt.ll
+++ b/test/CodeGen/R600/insert_vector_elt.ll
@@ -8,116 +8,116 @@
 ; FIXME: Why is the constant moved into the intermediate register and
 ; not just directly into the vector component?
 
-; SI-LABEL: @insertelement_v4f32_0:
-; S_LOAD_DWORDX4 s{{[}}[[LOW_REG:[0-9]+]]:
-; V_MOV_B32_e32
-; V_MOV_B32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00
-; V_MOV_B32_e32 v[[LOW_REG]], [[CONSTREG]]
-; BUFFER_STORE_DWORDX4 v{{[}}[[LOW_REG]]:
+; SI-LABEL: {{^}}insertelement_v4f32_0:
+; s_load_dwordx4 s{{[}}[[LOW_REG:[0-9]+]]:
+; v_mov_b32_e32
+; v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00
+; v_mov_b32_e32 v[[LOW_REG]], [[CONSTREG]]
+; buffer_store_dwordx4 v{{[}}[[LOW_REG]]:
 define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @insertelement_v4f32_1:
+; SI-LABEL: {{^}}insertelement_v4f32_1:
 define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @insertelement_v4f32_2:
+; SI-LABEL: {{^}}insertelement_v4f32_2:
 define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @insertelement_v4f32_3:
+; SI-LABEL: {{^}}insertelement_v4f32_3:
 define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @insertelement_v4i32_0:
+; SI-LABEL: {{^}}insertelement_v4i32_0:
 define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
   %vecins = insertelement <4 x i32> %a, i32 999, i32 0
   store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v2f32:
-; SI: V_MOV_B32_e32 [[CONST:v[0-9]+]], 5.000000e+00
-; SI: V_MOVRELD_B32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
-; SI: BUFFER_STORE_DWORDX2 {{v\[}}[[LOW_RESULT_REG]]:
+; SI-LABEL: {{^}}dynamic_insertelement_v2f32:
+; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; SI: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
 define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
   store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v4f32:
-; SI: V_MOV_B32_e32 [[CONST:v[0-9]+]], 5.000000e+00
-; SI: V_MOVRELD_B32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
-; SI: BUFFER_STORE_DWORDX4 {{v\[}}[[LOW_RESULT_REG]]:
+; SI-LABEL: {{^}}dynamic_insertelement_v4f32:
+; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; SI: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
 define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v8f32:
-; FIXMESI: BUFFER_STORE_DWORDX4
-; FIXMESI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}dynamic_insertelement_v8f32:
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
 define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
   store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v16f32:
-; FIXMESI: BUFFER_STORE_DWORDX4
-; FIXMESI: BUFFER_STORE_DWORDX4
-; FIXMESI: BUFFER_STORE_DWORDX4
-; FIXMESI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}dynamic_insertelement_v16f32:
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
 define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
   store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v2i32:
-; SI: BUFFER_STORE_DWORDX2
+; SI-LABEL: {{^}}dynamic_insertelement_v2i32:
+; SI: buffer_store_dwordx2
 define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
   store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v4i32:
-; SI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}dynamic_insertelement_v4i32:
+; SI: buffer_store_dwordx4
 define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x i32> %a, i32 5, i32 %b
   store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v8i32:
-; FIXMESI: BUFFER_STORE_DWORDX4
-; FIXMESI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}dynamic_insertelement_v8i32:
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
 define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
   store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v16i32:
-; FIXMESI: BUFFER_STORE_DWORDX4
-; FIXMESI: BUFFER_STORE_DWORDX4
-; FIXMESI: BUFFER_STORE_DWORDX4
-; FIXMESI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}dynamic_insertelement_v16i32:
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
 define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
   store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
@@ -125,16 +125,16 @@ define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i
 }
 
 
-; SI-LABEL: @dynamic_insertelement_v2i16:
-; FIXMESI: BUFFER_STORE_DWORDX2
+; SI-LABEL: {{^}}dynamic_insertelement_v2i16:
+; FIXMESI: buffer_store_dwordx2
 define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v4i16:
-; FIXMESI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}dynamic_insertelement_v4i16:
+; FIXMESI: buffer_store_dwordx4
 define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x i16> %a, i16 5, i32 %b
   store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 16
@@ -142,7 +142,7 @@ define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
 }
 
 
-; SI-LABEL: @dynamic_insertelement_v2i8:
+; SI-LABEL: {{^}}dynamic_insertelement_v2i8:
 ; FIXMESI: BUFFER_STORE_USHORT
 define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
@@ -150,24 +150,24 @@ define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v4i8:
-; FIXMESI: BUFFER_STORE_DWORD
+; SI-LABEL: {{^}}dynamic_insertelement_v4i8:
+; FIXMESI: buffer_store_dword
 define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
   store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v8i8:
-; FIXMESI: BUFFER_STORE_DWORDX2
+; SI-LABEL: {{^}}dynamic_insertelement_v8i8:
+; FIXMESI: buffer_store_dwordx2
 define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
   store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v16i8:
-; FIXMESI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}dynamic_insertelement_v16i8:
+; FIXMESI: buffer_store_dwordx4
 define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
   store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
@@ -176,7 +176,7 @@ define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8>
 
 ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
 ; the compiler doesn't crash.
-; SI-LABEL: @insert_split_bb
+; SI-LABEL: {{^}}insert_split_bb:
 define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
 entry:
   %0 = insertelement <2 x i32> undef, i32 %a, i32 0
@@ -199,3 +199,53 @@ endif:
   store <2 x i32> %7, <2 x i32> addrspace(1)* %out
   ret void
 }
+
+; SI-LABEL: {{^}}dynamic_insertelement_v2f64:
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
+  %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
+  store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v2i64:
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
+  %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
+  store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v4f64:
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
+  %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
+  store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v8f64:
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
+  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
+  store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
+  ret void
+}
diff --git a/test/CodeGen/R600/insert_vector_elt_f64.ll b/test/CodeGen/R600/insert_vector_elt_f64.ll
deleted file mode 100644
index 595bc59..0000000
--- a/test/CodeGen/R600/insert_vector_elt_f64.ll
+++ /dev/null
@@ -1,36 +0,0 @@
-; REQUIRES: asserts
-; XFAIL: *
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
-
-
-; SI-LABEL: @dynamic_insertelement_v2f64:
-; SI: BUFFER_STORE_DWORDX4
-define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
-  %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
-  store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
-  ret void
-}
-
-; SI-LABEL: @dynamic_insertelement_v2f64:
-; SI: BUFFER_STORE_DWORDX4
-define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
-  %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
-  store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
-  ret void
-}
-
-; SI-LABEL: @dynamic_insertelement_v4f64:
-; SI: BUFFER_STORE_DWORDX4
-define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
-  %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
-  store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
-  ret void
-}
-
-; SI-LABEL: @dynamic_insertelement_v8f64:
-; SI: BUFFER_STORE_DWORDX4
-define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
-  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
-  store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
-  ret void
-}
diff --git a/test/CodeGen/R600/kcache-fold.ll b/test/CodeGen/R600/kcache-fold.ll
index 0baa3cd..27840b2 100644
--- a/test/CodeGen/R600/kcache-fold.ll
+++ b/test/CodeGen/R600/kcache-fold.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK: @main1
+; CHECK: {{^}}main1:
 ; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}}
 define void @main1() {
 main_body:
@@ -48,7 +48,7 @@ main_body:
   ret void
 }
 
-; CHECK: @main2
+; CHECK: {{^}}main2:
 ; CHECK-NOT: MOV
 define void @main2() {
 main_body:
diff --git a/test/CodeGen/R600/kernel-args.ll b/test/CodeGen/R600/kernel-args.ll
index 6fc6979..9a7da90 100644
--- a/test/CodeGen/R600/kernel-args.ll
+++ b/test/CodeGen/R600/kernel-args.ll
@@ -2,10 +2,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; EG-CHECK-LABEL: @i8_arg
+; EG-CHECK-LABEL: {{^}}i8_arg:
 ; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: @i8_arg
-; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK-LABEL: {{^}}i8_arg:
+; SI-CHECK: buffer_load_ubyte
 
 define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
 entry:
@@ -14,10 +14,10 @@ entry:
   ret void
 }
 
-; EG-CHECK-LABEL: @i8_zext_arg
+; EG-CHECK-LABEL: {{^}}i8_zext_arg:
 ; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: @i8_zext_arg
-; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
+; SI-CHECK-LABEL: {{^}}i8_zext_arg:
+; SI-CHECK: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 
 define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
 entry:
@@ -26,10 +26,10 @@ entry:
   ret void
 }
 
-; EG-CHECK-LABEL: @i8_sext_arg
+; EG-CHECK-LABEL: {{^}}i8_sext_arg:
 ; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: @i8_sext_arg
-; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
+; SI-CHECK-LABEL: {{^}}i8_sext_arg:
+; SI-CHECK: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 
 define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
 entry:
@@ -38,10 +38,10 @@ entry:
   ret void
 }
 
-; EG-CHECK-LABEL: @i16_arg
+; EG-CHECK-LABEL: {{^}}i16_arg:
 ; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: @i16_arg
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK-LABEL: {{^}}i16_arg:
+; SI-CHECK: buffer_load_ushort
 
 define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
 entry:
@@ -50,10 +50,10 @@ entry:
   ret void
 }
 
-; EG-CHECK-LABEL: @i16_zext_arg
+; EG-CHECK-LABEL: {{^}}i16_zext_arg:
 ; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: @i16_zext_arg
-; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
+; SI-CHECK-LABEL: {{^}}i16_zext_arg:
+; SI-CHECK: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 
 define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
 entry:
@@ -62,10 +62,10 @@ entry:
   ret void
 }
 
-; EG-CHECK-LABEL: @i16_sext_arg
+; EG-CHECK-LABEL: {{^}}i16_sext_arg:
 ; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: @i16_sext_arg
-; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
+; SI-CHECK-LABEL: {{^}}i16_sext_arg:
+; SI-CHECK: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 
 define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
 entry:
@@ -74,176 +74,176 @@ entry:
   ret void
 }
 
-; EG-CHECK-LABEL: @i32_arg
+; EG-CHECK-LABEL: {{^}}i32_arg:
 ; EG-CHECK: T{{[0-9]\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: @i32_arg
-; S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
+; SI-CHECK-LABEL: {{^}}i32_arg:
+; s_load_dword s{{[0-9]}}, s[0:1], 0xb
 define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
 entry:
   store i32 %in, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @f32_arg
+; EG-CHECK-LABEL: {{^}}f32_arg:
 ; EG-CHECK: T{{[0-9]\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: @f32_arg
-; S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
+; SI-CHECK-LABEL: {{^}}f32_arg:
+; s_load_dword s{{[0-9]}}, s[0:1], 0xb
 define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
 entry:
   store float %in, float addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v2i8_arg
+; EG-CHECK-LABEL: {{^}}v2i8_arg:
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: @v2i8_arg
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK-LABEL: {{^}}v2i8_arg:
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
 define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
 entry:
   store <2 x i8> %in, <2 x i8> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @v2i16_arg
+; EG-CHECK-LABEL: {{^}}v2i16_arg:
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
-; SI-CHECK-LABEL: @v2i16_arg
-; SI-CHECK-DAG: BUFFER_LOAD_USHORT
-; SI-CHECK-DAG: BUFFER_LOAD_USHORT
+; SI-CHECK-LABEL: {{^}}v2i16_arg:
+; SI-CHECK-DAG: buffer_load_ushort
+; SI-CHECK-DAG: buffer_load_ushort
 define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
 entry:
   store <2 x i16> %in, <2 x i16> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @v2i32_arg
+; EG-CHECK-LABEL: {{^}}v2i32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
-; SI-CHECK-LABEL: @v2i32_arg
-; SI-CHECK: S_LOAD_DWORDX2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
+; SI-CHECK-LABEL: {{^}}v2i32_arg:
+; SI-CHECK: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
 define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
 entry:
   store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v2f32_arg
+; EG-CHECK-LABEL: {{^}}v2f32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
-; SI-CHECK-LABEL: @v2f32_arg
-; SI-CHECK: S_LOAD_DWORDX2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
+; SI-CHECK-LABEL: {{^}}v2f32_arg:
+; SI-CHECK: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
 define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
 entry:
   store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v3i8_arg
+; EG-CHECK-LABEL: {{^}}v3i8_arg:
 ; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
 ; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
 ; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
-; SI-CHECK-LABEL: @v3i8_arg
+; SI-CHECK-LABEL: {{^}}v3i8_arg:
 define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
 entry:
   store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v3i16_arg
+; EG-CHECK-LABEL: {{^}}v3i16_arg:
 ; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
 ; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
 ; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
-; SI-CHECK-LABEL: @v3i16_arg
+; SI-CHECK-LABEL: {{^}}v3i16_arg:
 define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
 entry:
   store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
   ret void
 }
-; EG-CHECK-LABEL: @v3i32_arg
+; EG-CHECK-LABEL: {{^}}v3i32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
-; SI-CHECK-LABEL: @v3i32_arg
-; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
+; SI-CHECK-LABEL: {{^}}v3i32_arg:
+; SI-CHECK: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
 define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
 entry:
   store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v3f32_arg
+; EG-CHECK-LABEL: {{^}}v3f32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
-; SI-CHECK-LABEL: @v3f32_arg
-; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
+; SI-CHECK-LABEL: {{^}}v3f32_arg:
+; SI-CHECK: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
 define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
 entry:
   store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v4i8_arg
+; EG-CHECK-LABEL: {{^}}v4i8_arg:
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: @v4i8_arg
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK-LABEL: {{^}}v4i8_arg:
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
 define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @v4i16_arg
+; EG-CHECK-LABEL: {{^}}v4i16_arg:
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
-; SI-CHECK-LABEL: @v4i16_arg
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK-LABEL: {{^}}v4i16_arg:
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
 define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
 entry:
   store <4 x i16> %in, <4 x i16> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @v4i32_arg
+; EG-CHECK-LABEL: {{^}}v4i32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
-; SI-CHECK-LABEL: @v4i32_arg
-; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
+; SI-CHECK-LABEL: {{^}}v4i32_arg:
+; SI-CHECK: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
 define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v4f32_arg
+; EG-CHECK-LABEL: {{^}}v4f32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
-; SI-CHECK-LABEL: @v4f32_arg
-; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
+; SI-CHECK-LABEL: {{^}}v4f32_arg:
+; SI-CHECK: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
 define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
 entry:
   store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v8i8_arg
+; EG-CHECK-LABEL: {{^}}v8i8_arg:
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
@@ -252,21 +252,21 @@ entry:
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: @v8i8_arg
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK-LABEL: {{^}}v8i8_arg:
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
 define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
 entry:
   store <8 x i8> %in, <8 x i8> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @v8i16_arg
+; EG-CHECK-LABEL: {{^}}v8i16_arg:
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
@@ -275,22 +275,22 @@ entry:
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
-; SI-CHECK-LABEL: @v8i16_arg
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK-LABEL: {{^}}v8i16_arg:
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
 define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
 entry:
   store <8 x i16> %in, <8 x i16> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @v8i32_arg
+; EG-CHECK-LABEL: {{^}}v8i32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
@@ -299,15 +299,15 @@ entry:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
-; SI-CHECK-LABEL: @v8i32_arg
-; SI-CHECK: S_LOAD_DWORDX8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
+; SI-CHECK-LABEL: {{^}}v8i32_arg:
+; SI-CHECK: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
 define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
 entry:
   store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v8f32_arg
+; EG-CHECK-LABEL: {{^}}v8f32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
@@ -316,15 +316,15 @@ entry:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
-; SI-CHECK-LABEL: @v8f32_arg
-; SI-CHECK: S_LOAD_DWORDX8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
+; SI-CHECK-LABEL: {{^}}v8f32_arg:
+; SI-CHECK: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
 define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
 entry:
   store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v16i8_arg
+; EG-CHECK-LABEL: {{^}}v16i8_arg:
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
@@ -341,30 +341,30 @@ entry:
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: @v16i8_arg
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK-LABEL: {{^}}v16i8_arg:
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
 define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
 entry:
   store <16 x i8> %in, <16 x i8> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @v16i16_arg
+; EG-CHECK-LABEL: {{^}}v16i16_arg:
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
@@ -381,30 +381,30 @@ entry:
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
-; SI-CHECK-LABEL: @v16i16_arg
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK-LABEL: {{^}}v16i16_arg:
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
 define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
 entry:
   store <16 x i16> %in, <16 x i16> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @v16i32_arg
+; EG-CHECK-LABEL: {{^}}v16i32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
@@ -421,15 +421,15 @@ entry:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
-; SI-CHECK-LABEL: @v16i32_arg
-; SI-CHECK: S_LOAD_DWORDX16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; SI-CHECK-LABEL: {{^}}v16i32_arg:
+; SI-CHECK: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
 define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
 entry:
   store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v16f32_arg
+; EG-CHECK-LABEL: {{^}}v16f32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
@@ -446,10 +446,28 @@ entry:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
-; SI-CHECK-LABEL: @v16f32_arg
-; SI-CHECK: S_LOAD_DWORDX16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; SI-CHECK-LABEL: {{^}}v16f32_arg:
+; SI-CHECK: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
 define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
 entry:
   store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
   ret void
 }
+
+; FUNC-LABEL: {{^}}kernel_arg_i64:
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI: buffer_store_dwordx2
+define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
+  store i64 %a, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
+; XSI: s_load_dwordx2
+; XSI: s_load_dwordx2
+; XSI: buffer_store_dwordx2
+; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
+;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
+;   ret void
+; }
diff --git a/test/CodeGen/R600/large-constant-initializer.ll b/test/CodeGen/R600/large-constant-initializer.ll
index 552cd05..5612dd3 100644
--- a/test/CodeGen/R600/large-constant-initializer.ll
+++ b/test/CodeGen/R600/large-constant-initializer.ll
@@ -1,6 +1,5 @@
-; XFAIL: *
-; REQUIRES: asserts
 ; RUN: llc -march=r600 -mcpu=SI < %s
+; CHECK: s_endpgm
 
 @gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4
 
diff --git a/test/CodeGen/R600/lds-initializer.ll b/test/CodeGen/R600/lds-initializer.ll
new file mode 100644
index 0000000..91d5d12
--- /dev/null
+++ b/test/CodeGen/R600/lds-initializer.ll
@@ -0,0 +1,12 @@
+; RUN: not llc -march=r600 -mcpu=SI < %s 2>&1 | FileCheck %s
+
+; CHECK: error: unsupported initializer for address space in load_init_lds_global
+
+@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]
+
+define void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) {
+ %gep = getelementptr [8 x i32] addrspace(3)* @lds, i32 0, i32 10
+  %ld = load i32 addrspace(3)* %gep
+  store i32 %ld, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/lds-oqap-crash.ll b/test/CodeGen/R600/lds-oqap-crash.ll
index 7959150..fbcd778 100644
--- a/test/CodeGen/R600/lds-oqap-crash.ll
+++ b/test/CodeGen/R600/lds-oqap-crash.ll
@@ -9,7 +9,7 @@
 ; because the LDS instructions are pseudo instructions and the OQAP
 ; reads and writes are bundled together in the same instruction.
 
-; CHECK: @lds_crash
+; CHECK: {{^}}lds_crash:
 define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = load i32 addrspace(3)* %in
diff --git a/test/CodeGen/R600/lds-output-queue.ll b/test/CodeGen/R600/lds-output-queue.ll
index d5dc061..cda75b0 100644
--- a/test/CodeGen/R600/lds-output-queue.ll
+++ b/test/CodeGen/R600/lds-output-queue.ll
@@ -3,12 +3,12 @@
 ; This test checks that the lds input queue will is empty at the end of
 ; the ALU clause.
 
-; CHECK-LABEL: @lds_input_queue
+; CHECK-LABEL: {{^}}lds_input_queue:
 ; CHECK: LDS_READ_RET * OQAP
 ; CHECK-NOT: ALU clause
 ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
 
-@local_mem = internal unnamed_addr addrspace(3) global [2 x i32] [i32 1, i32 2], align 4
+@local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4
 
 define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
 entry:
@@ -84,7 +84,7 @@ declare void @llvm.AMDGPU.barrier.local()
 ; analysis, we should be able to keep these instructions sparate before
 ; scheduling.
 ;
-; CHECK-LABEL: @local_global_alias
+; CHECK-LABEL: {{^}}local_global_alias:
 ; CHECK: LDS_READ_RET
 ; CHECK-NOT: ALU clause
 ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
diff --git a/test/CodeGen/R600/lds-size.ll b/test/CodeGen/R600/lds-size.ll
index 9182e25..5287723 100644
--- a/test/CodeGen/R600/lds-size.ll
+++ b/test/CodeGen/R600/lds-size.ll
@@ -3,10 +3,10 @@
 ; This test makes sure we do not double count global values when they are
 ; used in different basic blocks.
 
-; CHECK-LABEL: @test
+; CHECK-LABEL: {{^}}test:
 ; CHECK: .long   166120
 ; CHECK-NEXT: .long   1
-@lds = internal unnamed_addr addrspace(3) global i32 zeroinitializer, align 4
+@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4
 
 define void @test(i32 addrspace(1)* %out, i32 %cond) {
 entry:
diff --git a/test/CodeGen/R600/lds-zero-initializer.ll b/test/CodeGen/R600/lds-zero-initializer.ll
new file mode 100644
index 0000000..23912a9
--- /dev/null
+++ b/test/CodeGen/R600/lds-zero-initializer.ll
@@ -0,0 +1,12 @@
+; RUN: not llc -march=r600 -mcpu=SI < %s 2>&1 | FileCheck %s
+
+; CHECK: error: unsupported initializer for address space in load_zeroinit_lds_global
+
+@lds = addrspace(3) global [256 x i32] zeroinitializer
+
+define void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {
+ %gep = getelementptr [256 x i32] addrspace(3)* @lds, i32 0, i32 10
+  %ld = load i32 addrspace(3)* %gep
+  store i32 %ld, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll b/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll
index 1aae7f9..b9fa8e9 100644
--- a/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll
+++ b/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll
@@ -8,7 +8,7 @@
 ; instructions, when only one is needed.
 ;
 
-; CHECK: @setcc_expand
+; CHECK: {{^}}setcc_expand:
 ; CHECK: SET
 ; CHECK-NOT: CND
 define void @setcc_expand(i32 addrspace(1)* %out, i32 %in) {
diff --git a/test/CodeGen/R600/literals.ll b/test/CodeGen/R600/literals.ll
index 47191e0..cff1c24 100644
--- a/test/CodeGen/R600/literals.ll
+++ b/test/CodeGen/R600/literals.ll
@@ -6,7 +6,7 @@
 ; or
 ; ADD_INT literal.x KC0[2].Z, 5
 
-; CHECK: @i32_literal
+; CHECK: {{^}}i32_literal:
 ; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 5
@@ -23,7 +23,7 @@ entry:
 ; or
 ; ADD literal.x KC0[2].Z, 5.0
 
-; CHECK: @float_literal
+; CHECK: {{^}}float_literal:
 ; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.0
@@ -35,7 +35,7 @@ entry:
 }
 
 ; Make sure inline literals are folded into REG_SEQUENCE instructions.
-; CHECK: @inline_literal_reg_sequence
+; CHECK: {{^}}inline_literal_reg_sequence:
 ; CHECK: MOV {{\** *}}T[[GPR:[0-9]]].X, 0.0
 ; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Y, 0.0
 ; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0
@@ -47,7 +47,7 @@ entry:
   ret void
 }
 
-; CHECK: @inline_literal_dot4
+; CHECK: {{^}}inline_literal_dot4:
 ; CHECK: DOT4 T[[GPR:[0-9]]].X, 1.0
 ; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0
 ; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0
diff --git a/test/CodeGen/R600/llvm.AMDGPU.abs.ll b/test/CodeGen/R600/llvm.AMDGPU.abs.ll
index a0a47b7..b4aede8 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.abs.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.abs.ll
@@ -6,10 +6,10 @@ declare i32 @llvm.AMDGPU.abs(i32) nounwind readnone
 ; Legacy name
 declare i32 @llvm.AMDIL.abs.i32(i32) nounwind readnone
 
-; FUNC-LABEL: @s_abs_i32
-; SI: S_SUB_I32
-; SI: S_MAX_I32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}s_abs_i32:
+; SI: s_sub_i32
+; SI: s_max_i32
+; SI: s_endpgm
 
 ; EG: SUB_INT
 ; EG: MAX_INT
@@ -19,10 +19,10 @@ define void @s_abs_i32(i32 addrspace(1)* %out, i32 %src) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @v_abs_i32
-; SI: V_SUB_I32_e32
-; SI: V_MAX_I32_e32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_abs_i32:
+; SI: v_sub_i32_e32
+; SI: v_max_i32_e32
+; SI: s_endpgm
 
 ; EG: SUB_INT
 ; EG: MAX_INT
@@ -33,10 +33,10 @@ define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind
   ret void
 }
 
-; FUNC-LABEL: @abs_i32_legacy_amdil
-; SI: V_SUB_I32_e32
-; SI: V_MAX_I32_e32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}abs_i32_legacy_amdil:
+; SI: v_sub_i32_e32
+; SI: v_max_i32_e32
+; SI: s_endpgm
 
 ; EG: SUB_INT
 ; EG: MAX_INT
diff --git a/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll b/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll
new file mode 100644
index 0000000..98f6695
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}test_barrier_global:
+; EG: GROUP_BARRIER
+; SI: s_barrier
+
+define void @test_barrier_global(i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.tidig.x()
+  %1 = getelementptr i32 addrspace(1)* %out, i32 %0
+  store i32 %0, i32 addrspace(1)* %1
+  call void @llvm.AMDGPU.barrier.global()
+  %2 = call i32 @llvm.r600.read.local.size.x()
+  %3 = sub i32 %2, 1
+  %4 = sub i32 %3, %0
+  %5 = getelementptr i32 addrspace(1)* %out, i32 %4
+  %6 = load i32 addrspace(1)* %5
+  store i32 %6, i32 addrspace(1)* %1
+  ret void
+}
+
+declare void @llvm.AMDGPU.barrier.global()
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.r600.read.local.size.x() #0
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll b/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll
index 8d3c9ca..92fe9f2 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll
@@ -1,8 +1,11 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
-; CHECK: GROUP_BARRIER
+; FUNC-LABEL: {{^}}test_barrier_local:
+; EG: GROUP_BARRIER
+; SI: s_barrier
 
-define void @test(i32 addrspace(1)* %out) {
+define void @test_barrier_local(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x()
   %1 = getelementptr i32 addrspace(1)* %out, i32 %0
@@ -17,8 +20,9 @@ entry:
   ret void
 }
 
-declare i32 @llvm.r600.read.tidig.x() #0
 declare void @llvm.AMDGPU.barrier.local()
+
+declare i32 @llvm.r600.read.tidig.x() #0
 declare i32 @llvm.r600.read.local.size.x() #0
 
 attributes #0 = { readnone }
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
index eb50942..0b60d0d 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
@@ -3,8 +3,8 @@
 
 declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
 
-; FUNC-LABEL: @bfe_i32_arg_arg_arg
-; SI: V_BFE_I32
+; FUNC-LABEL: {{^}}bfe_i32_arg_arg_arg:
+; SI: v_bfe_i32
 ; EG: BFE_INT
 ; EG: encoding: [{{[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+}},0xac
 define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
@@ -13,8 +13,8 @@ define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_arg_arg_imm
-; SI: V_BFE_I32
+; FUNC-LABEL: {{^}}bfe_i32_arg_arg_imm:
+; SI: v_bfe_i32
 ; EG: BFE_INT
 define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 123) nounwind readnone
@@ -22,8 +22,8 @@ define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) n
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_arg_imm_arg
-; SI: V_BFE_I32
+; FUNC-LABEL: {{^}}bfe_i32_arg_imm_arg:
+; SI: v_bfe_i32
 ; EG: BFE_INT
 define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 123, i32 %src2) nounwind readnone
@@ -31,8 +31,8 @@ define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) n
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_imm_arg_arg
-; SI: V_BFE_I32
+; FUNC-LABEL: {{^}}bfe_i32_imm_arg_arg:
+; SI: v_bfe_i32
 ; EG: BFE_INT
 define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 123, i32 %src1, i32 %src2) nounwind readnone
@@ -40,8 +40,8 @@ define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) n
   ret void
 }
 
-; FUNC-LABEL: @v_bfe_print_arg
-; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8
+; FUNC-LABEL: {{^}}v_bfe_print_arg:
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8
 define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind {
   %load = load i32 addrspace(1)* %src0, align 4
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone
@@ -49,9 +49,9 @@ define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) no
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_arg_0_width_reg_offset
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone
@@ -59,9 +59,9 @@ define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_arg_0_width_imm_offset
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone
@@ -69,10 +69,10 @@ define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_test_6
-; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_6:
+; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI: s_endpgm
 define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -81,12 +81,12 @@ define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_test_7
-; SI-NOT: SHL
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_7:
+; SI-NOT: shl
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -96,10 +96,10 @@ define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
 }
 
 ; FIXME: The shifts should be 1 BFE
-; FUNC-LABEL: @bfe_i32_test_8
-; SI: BUFFER_LOAD_DWORD
-; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_8:
+; SI: buffer_load_dword
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
+; SI: s_endpgm
 define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -108,11 +108,11 @@ define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_test_9
-; SI-NOT: BFE
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_9:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1)
@@ -120,11 +120,11 @@ define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_test_10
-; SI-NOT: BFE
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_10:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31)
@@ -132,11 +132,11 @@ define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_test_11
-; SI-NOT: BFE
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_11:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24)
@@ -144,11 +144,11 @@ define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_test_12
-; SI-NOT: BFE
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_12:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8)
@@ -156,10 +156,10 @@ define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_test_13
-; SI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_13:
+; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = ashr i32 %x, 31
@@ -167,10 +167,10 @@ define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
   store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
 }
 
-; FUNC-LABEL: @bfe_i32_test_14
-; SI-NOT: LSHR
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_14:
+; SI-NOT: lshr
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = lshr i32 %x, 31
@@ -178,11 +178,11 @@ define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
   store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_0
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_0:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone
@@ -190,11 +190,11 @@ define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_1
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_1:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone
@@ -202,11 +202,11 @@ define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_2
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_2:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone
@@ -214,11 +214,11 @@ define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_3
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_3:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind readnone
@@ -226,11 +226,11 @@ define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_4
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_4:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone
@@ -238,11 +238,11 @@ define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_5
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_5:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone
@@ -250,11 +250,11 @@ define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_6
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0xffffff80
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_6:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone
@@ -262,11 +262,11 @@ define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_7
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_7:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone
@@ -274,11 +274,11 @@ define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_8
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_8:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) nounwind readnone
@@ -286,11 +286,11 @@ define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_9
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_9:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone
@@ -298,11 +298,11 @@ define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_10
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_10:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone
@@ -310,11 +310,11 @@ define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_11
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -6
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_11:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -6
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone
@@ -322,11 +322,11 @@ define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_12
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_12:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone
@@ -334,11 +334,11 @@ define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_13
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_13:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone
@@ -346,11 +346,11 @@ define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_14
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 40
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_14:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone
@@ -358,11 +358,11 @@ define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_15
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 10
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_15:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone
@@ -370,11 +370,11 @@ define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_16
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_16:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone
@@ -382,11 +382,11 @@ define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_17
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_17:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone
@@ -394,11 +394,11 @@ define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_18
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_18:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone
@@ -408,14 +408,14 @@ define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
 
 ; XXX - This should really be a single BFE, but the sext_inreg of the
 ; extended type i24 is never custom lowered.
-; FUNC-LABEL: @bfe_sext_in_reg_i24
-; SI: BUFFER_LOAD_DWORD [[LOAD:v[0-9]+]],
-; SI: V_LSHLREV_B32_e32 {{v[0-9]+}}, 8, {{v[0-9]+}}
-; SI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 8, {{v[0-9]+}}
-; XSI: V_BFE_I32 [[BFE:v[0-9]+]], [[LOAD]], 0, 8
+; FUNC-LABEL: {{^}}bfe_sext_in_reg_i24:
+; SI: buffer_load_dword [[LOAD:v[0-9]+]],
+; SI: v_lshlrev_b32_e32 {{v[0-9]+}}, 8, {{v[0-9]+}}
+; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 8, {{v[0-9]+}}
+; XSI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 8
 ; XSI-NOT: SHL
 ; XSI-NOT: SHR
-; XSI: BUFFER_STORE_DWORD [[BFE]],
+; XSI: buffer_store_dword [[BFE]],
 define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24)
@@ -424,3 +424,18 @@ define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
   store i32 %ashr, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; FUNC-LABEL: @simplify_demanded_bfe_sdiv
+; SI: buffer_load_dword [[LOAD:v[0-9]+]]
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16
+; SI: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]]
+; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], [[TMP0]], [[BFE]]
+; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]]
+; SI: buffer_store_dword [[TMP2]]
+define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %src = load i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %src, i32 1, i32 16) nounwind readnone
+  %div = sdiv i32 %bfe, 2
+  store i32 %div, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
index 1a62253..0794ac4 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
@@ -3,8 +3,8 @@
 
 declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone
 
-; FUNC-LABEL: @bfe_u32_arg_arg_arg
-; SI: V_BFE_U32
+; FUNC-LABEL: {{^}}bfe_u32_arg_arg_arg:
+; SI: v_bfe_u32
 ; EG: BFE_UINT
 define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
@@ -12,8 +12,8 @@ define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_arg_arg_imm
-; SI: V_BFE_U32
+; FUNC-LABEL: {{^}}bfe_u32_arg_arg_imm:
+; SI: v_bfe_u32
 ; EG: BFE_UINT
 define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 123) nounwind readnone
@@ -21,8 +21,8 @@ define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) n
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_arg_imm_arg
-; SI: V_BFE_U32
+; FUNC-LABEL: {{^}}bfe_u32_arg_imm_arg:
+; SI: v_bfe_u32
 ; EG: BFE_UINT
 define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 123, i32 %src2) nounwind readnone
@@ -30,8 +30,8 @@ define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) n
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_imm_arg_arg
-; SI: V_BFE_U32
+; FUNC-LABEL: {{^}}bfe_u32_imm_arg_arg:
+; SI: v_bfe_u32
 ; EG: BFE_UINT
 define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 123, i32 %src1, i32 %src2) nounwind readnone
@@ -39,9 +39,9 @@ define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) n
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_arg_0_width_reg_offset
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone
@@ -49,9 +49,9 @@ define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_arg_0_width_imm_offset
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone
@@ -59,10 +59,10 @@ define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_zextload_i8
-; SI: BUFFER_LOAD_UBYTE
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_zextload_i8:
+; SI: buffer_load_ubyte
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
   %load = load i8 addrspace(1)* %in
   %ext = zext i8 %load to i32
@@ -71,12 +71,12 @@ define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) n
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_zext_in_reg_i8
-; SI: BUFFER_LOAD_DWORD
-; SI: V_ADD_I32
-; SI-NEXT: V_AND_B32_e32
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI-NEXT: v_and_b32_e32
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
@@ -86,12 +86,12 @@ define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %i
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_zext_in_reg_i16
-; SI: BUFFER_LOAD_DWORD
-; SI: V_ADD_I32
-; SI-NEXT: V_AND_B32_e32
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI-NEXT: v_and_b32_e32
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
@@ -101,11 +101,11 @@ define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_zext_in_reg_i8_offset_1
-; SI: BUFFER_LOAD_DWORD
-; SI: V_ADD_I32
-; SI: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_1:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI: bfe
+; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
@@ -115,12 +115,12 @@ define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspa
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_zext_in_reg_i8_offset_3
-; SI: BUFFER_LOAD_DWORD
-; SI: V_ADD_I32
-; SI-NEXT: V_AND_B32_e32 {{v[0-9]+}}, 0xf8
-; SI-NEXT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_3:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8
+; SI-NEXT: bfe
+; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
@@ -130,12 +130,12 @@ define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspa
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_zext_in_reg_i8_offset_7
-; SI: BUFFER_LOAD_DWORD
-; SI: V_ADD_I32
-; SI-NEXT: V_AND_B32_e32 {{v[0-9]+}}, 0x80
-; SI-NEXT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_7:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80
+; SI-NEXT: bfe
+; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
@@ -145,11 +145,11 @@ define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspa
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_zext_in_reg_i16_offset_8
-; SI: BUFFER_LOAD_DWORD
-; SI: V_ADD_I32
-; SI-NEXT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16_offset_8:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI-NEXT: bfe
+; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
@@ -159,10 +159,10 @@ define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrsp
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_1
-; SI: BUFFER_LOAD_DWORD
-; SI: V_AND_B32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_1:
+; SI: buffer_load_dword
+; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
+; SI: s_endpgm
 ; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1,
 define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
@@ -187,13 +187,13 @@ define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_4
-; SI-NOT: LSHL
-; SI-NOT: SHR
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_4:
+; SI-NOT: lshl
+; SI-NOT: shr
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -203,12 +203,12 @@ define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_5
-; SI: BUFFER_LOAD_DWORD
-; SI-NOT: LSHL
-; SI-NOT: SHR
-; SI: V_BFE_I32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_5:
+; SI: buffer_load_dword
+; SI-NOT: lshl
+; SI-NOT: shr
+; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1
+; SI: s_endpgm
 define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -218,10 +218,10 @@ define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_6
-; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_6:
+; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI: s_endpgm
 define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -230,10 +230,10 @@ define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_7
-; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_7:
+; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -242,11 +242,11 @@ define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_8
-; SI-NOT: BFE
-; SI: V_AND_B32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_8:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -255,11 +255,11 @@ define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_9
-; SI-NOT: BFE
-; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_9:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1)
@@ -267,11 +267,11 @@ define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_10
-; SI-NOT: BFE
-; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_10:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31)
@@ -279,11 +279,11 @@ define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_11
-; SI-NOT: BFE
-; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_11:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24)
@@ -291,11 +291,11 @@ define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_12
-; SI-NOT: BFE
-; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_12:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8)
@@ -303,10 +303,10 @@ define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_13
+; FUNC-LABEL: {{^}}bfe_u32_test_13:
 ; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = ashr i32 %x, 31
@@ -314,10 +314,10 @@ define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
   store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_14
-; SI-NOT: LSHR
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_14:
+; SI-NOT: lshr
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = lshr i32 %x, 31
@@ -325,11 +325,11 @@ define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun
   store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_0
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_0:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone
@@ -337,11 +337,11 @@ define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_1
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_1:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone
@@ -349,11 +349,11 @@ define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_2
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_2:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone
@@ -361,11 +361,11 @@ define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_3
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_3:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone
@@ -373,11 +373,11 @@ define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_4
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_4:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone
@@ -385,11 +385,11 @@ define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_5
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_5:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone
@@ -397,11 +397,11 @@ define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_6
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x80
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_6:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x80
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone
@@ -409,11 +409,11 @@ define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_7
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_7:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone
@@ -421,11 +421,11 @@ define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_8
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_8:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone
@@ -433,11 +433,11 @@ define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_9
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_9:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFEfppppppppppppp
 define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone
@@ -445,11 +445,11 @@ define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_10
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_10:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone
@@ -457,11 +457,11 @@ define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_11
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 10
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_11:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone
@@ -469,11 +469,11 @@ define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_12
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_12:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone
@@ -481,11 +481,11 @@ define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_13
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_13:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone
@@ -493,11 +493,11 @@ define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_14
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 40
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_14:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone
@@ -505,11 +505,11 @@ define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_15
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 10
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_15:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone
@@ -517,11 +517,11 @@ define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_16
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_16:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone
@@ -529,11 +529,11 @@ define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_17
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_17:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone
@@ -541,14 +541,36 @@ define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_18
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_18:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; Make sure that SimplifyDemandedBits doesn't cause the and to be
+; reduced to the bits demanded by the bfe.
+
+; XXX: The operand to v_bfe_u32 could also just directly be the load register.
+; FUNC-LABEL: {{^}}simplify_bfe_u32_multi_use_arg:
+; SI: buffer_load_dword [[ARG:v[0-9]+]]
+; SI: v_and_b32_e32 [[AND:v[0-9]+]], 63, [[ARG]]
+; SI: v_bfe_u32 [[BFE:v[0-9]+]], [[AND]], 2, 2
+; SI-DAG: buffer_store_dword [[AND]]
+; SI-DAG: buffer_store_dword [[BFE]]
+; SI: s_endpgm
+define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
+                                            i32 addrspace(1)* %out1,
+                                            i32 addrspace(1)* %in) nounwind {
+  %src = load i32 addrspace(1)* %in, align 4
+  %and = and i32 %src, 63
+  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %and, i32 2, i32 2) nounwind readnone
+  store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4
+  store i32 %and, i32 addrspace(1)* %out1, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfi.ll b/test/CodeGen/R600/llvm.AMDGPU.bfi.ll
index e1de45b..df61b0b 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfi.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfi.ll
@@ -3,8 +3,8 @@
 
 declare i32 @llvm.AMDGPU.bfi(i32, i32, i32) nounwind readnone
 
-; FUNC-LABEL: @bfi_arg_arg_arg
-; SI: V_BFI_B32
+; FUNC-LABEL: {{^}}bfi_arg_arg_arg:
+; SI: v_bfi_b32
 ; EG: BFI_INT
 define void @bfi_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
   %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
@@ -12,8 +12,8 @@ define void @bfi_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %
   ret void
 }
 
-; FUNC-LABEL: @bfi_arg_arg_imm
-; SI: V_BFI_B32
+; FUNC-LABEL: {{^}}bfi_arg_arg_imm:
+; SI: v_bfi_b32
 ; EG: BFI_INT
 define void @bfi_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 123) nounwind readnone
@@ -21,8 +21,8 @@ define void @bfi_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounw
   ret void
 }
 
-; FUNC-LABEL: @bfi_arg_imm_arg
-; SI: V_BFI_B32
+; FUNC-LABEL: {{^}}bfi_arg_imm_arg:
+; SI: v_bfi_b32
 ; EG: BFI_INT
 define void @bfi_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
   %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 123, i32 %src2) nounwind readnone
@@ -30,8 +30,8 @@ define void @bfi_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounw
   ret void
 }
 
-; FUNC-LABEL: @bfi_imm_arg_arg
-; SI: V_BFI_B32
+; FUNC-LABEL: {{^}}bfi_imm_arg_arg:
+; SI: v_bfi_b32
 ; EG: BFI_INT
 define void @bfi_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
   %bfi = call i32 @llvm.AMDGPU.bfi(i32 123, i32 %src1, i32 %src2) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfm.ll b/test/CodeGen/R600/llvm.AMDGPU.bfm.ll
index ef8721e..0ba4af5 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfm.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfm.ll
@@ -3,8 +3,8 @@
 
 declare i32 @llvm.AMDGPU.bfm(i32, i32) nounwind readnone
 
-; FUNC-LABEL: @bfm_arg_arg
-; SI: V_BFM
+; FUNC-LABEL: {{^}}bfm_arg_arg:
+; SI: v_bfm
 ; EG: BFM_INT
 define void @bfm_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 %src1) nounwind readnone
@@ -12,8 +12,8 @@ define void @bfm_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind
   ret void
 }
 
-; FUNC-LABEL: @bfm_arg_imm
-; SI: V_BFM
+; FUNC-LABEL: {{^}}bfm_arg_imm:
+; SI: v_bfm
 ; EG: BFM_INT
 define void @bfm_arg_imm(i32 addrspace(1)* %out, i32 %src0) nounwind {
   %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 123) nounwind readnone
@@ -21,8 +21,8 @@ define void @bfm_arg_imm(i32 addrspace(1)* %out, i32 %src0) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfm_imm_arg
-; SI: V_BFM
+; FUNC-LABEL: {{^}}bfm_imm_arg:
+; SI: v_bfm
 ; EG: BFM_INT
 define void @bfm_imm_arg(i32 addrspace(1)* %out, i32 %src1) nounwind {
   %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 %src1) nounwind readnone
@@ -30,8 +30,8 @@ define void @bfm_imm_arg(i32 addrspace(1)* %out, i32 %src1) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @bfm_imm_imm
-; SI: V_BFM
+; FUNC-LABEL: {{^}}bfm_imm_imm:
+; SI: v_bfm
 ; EG: BFM_INT
 define void @bfm_imm_imm(i32 addrspace(1)* %out) nounwind {
   %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 456) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.brev.ll b/test/CodeGen/R600/llvm.AMDGPU.brev.ll
index 68a5ad0..647df34 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.brev.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.brev.ll
@@ -2,23 +2,23 @@
 
 declare i32 @llvm.AMDGPU.brev(i32) nounwind readnone
 
-; FUNC-LABEL: @s_brev_i32:
-; SI: S_LOAD_DWORD [[VAL:s[0-9]+]],
-; SI: S_BREV_B32 [[SRESULT:s[0-9]+]], [[VAL]]
-; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: BUFFER_STORE_DWORD [[VRESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}s_brev_i32:
+; SI: s_load_dword [[VAL:s[0-9]+]],
+; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
 define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @v_brev_i32:
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_BFREV_B32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_brev_i32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32 addrspace(1)* %valptr, align 4
   %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.clamp.ll b/test/CodeGen/R600/llvm.AMDGPU.clamp.ll
index d608953..c6efdb9 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.clamp.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.clamp.ll
@@ -1,14 +1,15 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
+declare float @llvm.fabs.f32(float) nounwind readnone
 declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone
 declare float @llvm.AMDIL.clamp.f32(float, float, float) nounwind readnone
 
-; FUNC-LABEL: @clamp_0_1_f32
-; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
-; SI: V_ADD_F32_e64 [[RESULT:v[0-9]+]], [[ARG]], 0, 1, 0
-; SI: BUFFER_STORE_DWORD [[RESULT]]
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}clamp_0_1_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
 
 ; EG: MOV_SAT
 define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
@@ -17,10 +18,47 @@ define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @clamp_0_1_amdil_legacy_f32
-; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
-; SI: V_ADD_F32_e64 [[RESULT:v[0-9]+]], [[ARG]], 0, 1, 0
-; SI: BUFFER_STORE_DWORD [[RESULT]]
+; FUNC-LABEL: {{^}}clamp_fabs_0_1_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, |[[ARG]]| clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
+  %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone
+  %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fabs, float 0.0, float 1.0) nounwind readnone
+  store float %clamp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}clamp_fneg_0_1_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -[[ARG]] clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
+  %src.fneg = fsub float -0.0, %src
+  %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg, float 0.0, float 1.0) nounwind readnone
+  store float %clamp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}clamp_fneg_fabs_0_1_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -|[[ARG]]| clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
+  %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone
+  %src.fneg.fabs = fsub float -0.0, %src.fabs
+  %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg.fabs, float 0.0, float 1.0) nounwind readnone
+  store float %clamp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}clamp_0_1_amdil_legacy_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
 define void @clamp_0_1_amdil_legacy_f32(float addrspace(1)* %out, float %src) nounwind {
   %clamp = call float @llvm.AMDIL.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone
   store float %clamp, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/llvm.AMDGPU.cube.ll b/test/CodeGen/R600/llvm.AMDGPU.cube.ll
index 110bbfd..aa07afd 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.cube.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.cube.ll
@@ -1,7 +1,7 @@
 
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK: @cube
+; CHECK: {{^}}cube:
 ; CHECK: CUBE T{{[0-9]}}.X
 ; CHECK: CUBE T{{[0-9]}}.Y
 ; CHECK: CUBE T{{[0-9]}}.Z
diff --git a/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll b/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll
index 6facb47..7aacbb9 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll
@@ -5,8 +5,8 @@ declare float @llvm.AMDGPU.cvt.f32.ubyte1(i32) nounwind readnone
 declare float @llvm.AMDGPU.cvt.f32.ubyte2(i32) nounwind readnone
 declare float @llvm.AMDGPU.cvt.f32.ubyte3(i32) nounwind readnone
 
-; SI-LABEL: @test_unpack_byte0_to_float:
-; SI: V_CVT_F32_UBYTE0
+; SI-LABEL: {{^}}test_unpack_byte0_to_float:
+; SI: v_cvt_f32_ubyte0
 define void @test_unpack_byte0_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32 addrspace(1)* %in, align 4
   %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte0(i32 %val) nounwind readnone
@@ -14,8 +14,8 @@ define void @test_unpack_byte0_to_float(float addrspace(1)* %out, i32 addrspace(
   ret void
 }
 
-; SI-LABEL: @test_unpack_byte1_to_float:
-; SI: V_CVT_F32_UBYTE1
+; SI-LABEL: {{^}}test_unpack_byte1_to_float:
+; SI: v_cvt_f32_ubyte1
 define void @test_unpack_byte1_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32 addrspace(1)* %in, align 4
   %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte1(i32 %val) nounwind readnone
@@ -23,8 +23,8 @@ define void @test_unpack_byte1_to_float(float addrspace(1)* %out, i32 addrspace(
   ret void
 }
 
-; SI-LABEL: @test_unpack_byte2_to_float:
-; SI: V_CVT_F32_UBYTE2
+; SI-LABEL: {{^}}test_unpack_byte2_to_float:
+; SI: v_cvt_f32_ubyte2
 define void @test_unpack_byte2_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32 addrspace(1)* %in, align 4
   %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte2(i32 %val) nounwind readnone
@@ -32,8 +32,8 @@ define void @test_unpack_byte2_to_float(float addrspace(1)* %out, i32 addrspace(
   ret void
 }
 
-; SI-LABEL: @test_unpack_byte3_to_float:
-; SI: V_CVT_F32_UBYTE3
+; SI-LABEL: {{^}}test_unpack_byte3_to_float:
+; SI: v_cvt_f32_ubyte3
 define void @test_unpack_byte3_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32 addrspace(1)* %in, align 4
   %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte3(i32 %val) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll b/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
index c8c7357..009fd73 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
@@ -3,23 +3,23 @@
 declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone
 declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone
 
-; SI-LABEL: @test_div_fixup_f32:
-; SI-DAG: S_LOAD_DWORD [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: S_LOAD_DWORD [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI-DAG: V_MOV_B32_e32 [[VC:v[0-9]+]], [[SC]]
-; SI-DAG: S_LOAD_DWORD [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: V_MOV_B32_e32 [[VB:v[0-9]+]], [[SB]]
-; SI: V_DIV_FIXUP_F32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}test_div_fixup_f32:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
   %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: @test_div_fixup_f64:
-; SI: V_DIV_FIXUP_F64
+; SI-LABEL: {{^}}test_div_fixup_f64:
+; SI: v_div_fixup_f64
 define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
   %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll b/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
index 4f1e827..dcca9e9 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
@@ -1,27 +1,27 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-declare float @llvm.AMDGPU.div.fmas.f32(float, float, float) nounwind readnone
-declare double @llvm.AMDGPU.div.fmas.f64(double, double, double) nounwind readnone
+declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone
+declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone
 
-; SI-LABEL: @test_div_fmas_f32:
-; SI-DAG: S_LOAD_DWORD [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: S_LOAD_DWORD [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI-DAG: V_MOV_B32_e32 [[VC:v[0-9]+]], [[SC]]
-; SI-DAG: S_LOAD_DWORD [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: V_MOV_B32_e32 [[VB:v[0-9]+]], [[SB]]
-; SI: V_DIV_FMAS_F32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
-define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
-  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c) nounwind readnone
+; SI-LABEL: {{^}}test_div_fmas_f32:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: @test_div_fmas_f64:
-; SI: V_DIV_FMAS_F64
-define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
-  %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c) nounwind readnone
+; SI-LABEL: {{^}}test_div_fmas_f64:
+; SI: v_div_fmas_f64
+define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
+  %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
   ret void
 }
diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll b/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
index 527c8da..641c8ca 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
@@ -1,13 +1,23 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone
 declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone
 
 ; SI-LABEL @test_div_scale_f32_1:
-; SI: V_DIV_SCALE_F32
-define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind {
-  %a = load float addrspace(1)* %aptr, align 4
-  %b = load float addrspace(1)* %bptr, align 4
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
   %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
   store float %result0, float addrspace(1)* %out, align 4
@@ -15,10 +25,19 @@ define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)*
 }
 
 ; SI-LABEL @test_div_scale_f32_2:
-; SI: V_DIV_SCALE_F32
-define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind {
-  %a = load float addrspace(1)* %aptr, align 4
-  %b = load float addrspace(1)* %bptr, align 4
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
   %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
   store float %result0, float addrspace(1)* %out, align 4
@@ -26,10 +45,19 @@ define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)*
 }
 
 ; SI-LABEL @test_div_scale_f64_1:
-; SI: V_DIV_SCALE_F64
-define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr, double addrspace(1)* %cptr) nounwind {
-  %a = load double addrspace(1)* %aptr, align 8
-  %b = load double addrspace(1)* %bptr, align 8
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x8
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
   %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
   store double %result0, double addrspace(1)* %out, align 8
@@ -37,10 +65,221 @@ define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)
 }
 
 ; SI-LABEL @test_div_scale_f64_1:
-; SI: V_DIV_SCALE_F64
-define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr, double addrspace(1)* %cptr) nounwind {
-  %a = load double addrspace(1)* %aptr, align 8
-  %b = load double addrspace(1)* %bptr, align 8
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x8
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f32_scalar_num_1:
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]]
+; SI-DAG: s_load_dword [[A:s[0-9]+]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr float addrspace(1)* %in, i32 %tid
+
+  %b = load float addrspace(1)* %gep, align 4
+
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f32_scalar_num_2:
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]]
+; SI-DAG: s_load_dword [[A:s[0-9]+]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr float addrspace(1)* %in, i32 %tid
+
+  %b = load float addrspace(1)* %gep, align 4
+
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f32_scalar_den_1:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]]
+; SI-DAG: s_load_dword [[B:s[0-9]+]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr float addrspace(1)* %in, i32 %tid
+
+  %a = load float addrspace(1)* %gep, align 4
+
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f32_scalar_den_2:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]]
+; SI-DAG: s_load_dword [[B:s[0-9]+]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr float addrspace(1)* %in, i32 %tid
+
+  %a = load float addrspace(1)* %gep, align 4
+
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f64_scalar_num_1:
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+
+  %b = load double addrspace(1)* %gep, align 8
+
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f64_scalar_num_2:
+; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+
+  %b = load double addrspace(1)* %gep, align 8
+
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f64_scalar_den_1:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+
+  %a = load double addrspace(1)* %gep, align 8
+
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f64_scalar_den_2:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+
+  %a = load double addrspace(1)* %gep, align 8
+
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f32_all_scalar_1:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind {
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f32_all_scalar_2:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind {
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f64_all_scalar_1:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]]
+; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]]
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind {
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f64_all_scalar_2:
+; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]]
+; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]]
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind {
   %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
   store double %result0, double addrspace(1)* %out, align 8
diff --git a/test/CodeGen/R600/llvm.AMDGPU.fract.ll b/test/CodeGen/R600/llvm.AMDGPU.fract.ll
index 72ec1c5..235068c 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.fract.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.fract.ll
@@ -6,8 +6,8 @@ declare float @llvm.AMDGPU.fract.f32(float) nounwind readnone
 ; Legacy name
 declare float @llvm.AMDIL.fraction.f32(float) nounwind readnone
 
-; FUNC-LABEL: @fract_f32
-; SI: V_FRACT_F32
+; FUNC-LABEL: {{^}}fract_f32:
+; SI: v_fract_f32
 ; EG: FRACT
 define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
   %val = load float addrspace(1)* %src, align 4
@@ -16,8 +16,8 @@ define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounw
   ret void
 }
 
-; FUNC-LABEL: @fract_f32_legacy_amdil
-; SI: V_FRACT_F32
+; FUNC-LABEL: {{^}}fract_f32_legacy_amdil:
+; SI: v_fract_f32
 ; EG: FRACT
 define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
   %val = load float addrspace(1)* %src, align 4
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imad24.ll b/test/CodeGen/R600/llvm.AMDGPU.imad24.ll
index 95795ea..8998840 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imad24.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imad24.ll
@@ -8,8 +8,8 @@
 
 declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) nounwind readnone
 
-; FUNC-LABEL: @test_imad24
-; SI: V_MAD_I32_I24
+; FUNC-LABEL: {{^}}test_imad24:
+; SI: v_mad_i32_i24
 ; CM: MULADD_INT24
 ; R600: MULLO_INT
 ; R600: ADD_INT
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imax.ll b/test/CodeGen/R600/llvm.AMDGPU.imax.ll
index 01c9f43..dac21a4 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imax.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imax.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @vector_imax
-; SI: V_MAX_I32_e32
+; SI-LABEL: {{^}}vector_imax:
+; SI: v_max_i32_e32
 define void @vector_imax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
 main_body:
   %load = load i32 addrspace(1)* %in, align 4
@@ -11,8 +11,8 @@ main_body:
   ret void
 }
 
-; SI-LABEL: @scalar_imax
-; SI: S_MAX_I32
+; SI-LABEL: {{^}}scalar_imax:
+; SI: s_max_i32
 define void @scalar_imax(i32 %p0, i32 %p1) #0 {
 entry:
   %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %p1)
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imin.ll b/test/CodeGen/R600/llvm.AMDGPU.imin.ll
index 565bf34..462c497 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imin.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imin.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @vector_imin
-; SI: V_MIN_I32_e32
+; SI-LABEL: {{^}}vector_imin:
+; SI: v_min_i32_e32
 define void @vector_imin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
 main_body:
   %load = load i32 addrspace(1)* %in, align 4
@@ -11,8 +11,8 @@ main_body:
   ret void
 }
 
-; SI-LABEL: @scalar_imin
-; SI: S_MIN_I32
+; SI-LABEL: {{^}}scalar_imin:
+; SI: s_min_i32
 define void @scalar_imin(i32 %p0, i32 %p1) #0 {
 entry:
   %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %p1)
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imul24.ll b/test/CodeGen/R600/llvm.AMDGPU.imul24.ll
index 8ee3520..db563dd 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imul24.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imul24.ll
@@ -4,8 +4,8 @@
 
 declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone
 
-; FUNC-LABEL: @test_imul24
-; SI: V_MUL_I32_I24
+; FUNC-LABEL: {{^}}test_imul24:
+; SI: v_mul_i32_i24
 ; CM: MUL_INT24
 ; R600: MULLO_INT
 define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
diff --git a/test/CodeGen/R600/llvm.AMDGPU.kill.ll b/test/CodeGen/R600/llvm.AMDGPU.kill.ll
index 4ab6a8a..988b43c 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.kill.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.kill.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @kill_gs_const
-; SI-NOT: V_CMPX_LE_F32
-; SI: S_MOV_B64 exec, 0
+; SI-LABEL: {{^}}kill_gs_const:
+; SI-NOT: v_cmpx_le_f32
+; SI: s_mov_b64 exec, 0
 
 define void @kill_gs_const() #0 {
 main_body:
diff --git a/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll b/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll
new file mode 100644
index 0000000..72719fe
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.AMDGPU.ldexp.f32(float, i32) nounwind readnone
+declare double @llvm.AMDGPU.ldexp.f64(double, i32) nounwind readnone
+
+; SI-LABEL: {{^}}test_ldexp_f32:
+; SI: v_ldexp_f32
+; SI: s_endpgm
+define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind {
+  %result = call float @llvm.AMDGPU.ldexp.f32(float %a, i32 %b) nounwind readnone
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_ldexp_f64:
+; SI: v_ldexp_f64
+; SI: s_endpgm
+define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind {
+  %result = call double @llvm.AMDGPU.ldexp.f64(double %a, i32 %b) nounwind readnone
+  store double %result, double addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll b/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll
index 51964ee..6e3fa25 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll
@@ -3,8 +3,8 @@
 
 declare float @llvm.AMDGPU.legacy.rsq(float) nounwind readnone
 
-; FUNC-LABEL: @rsq_legacy_f32
-; SI: V_RSQ_LEGACY_F32_e32
+; FUNC-LABEL: {{^}}rsq_legacy_f32:
+; SI: v_rsq_legacy_f32_e32
 ; EG: RECIPSQRT_IEEE
 define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) nounwind {
   %rsq = call float @llvm.AMDGPU.legacy.rsq(float %src) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll
new file mode 100644
index 0000000..c4b04c5
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
+declare double @llvm.sqrt.f64(double) nounwind readnone
+
+; FUNC-LABEL: {{^}}rcp_f64:
+; SI: v_rcp_f64_e32
+define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind {
+  %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone
+  store double %rcp, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_pat_f64:
+; SI: v_rcp_f64_e32
+define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
+  %rcp = fdiv double 1.0, %src
+  store double %rcp, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_rcp_pat_f64:
+; SI-UNSAFE: v_rsq_f64_e32
+; SI-SAFE-NOT: v_rsq_f64_e32
+define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
+  %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone
+  %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone
+  store double %rcp, double addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.rcp.ll b/test/CodeGen/R600/llvm.AMDGPU.rcp.ll
index ca5260d..3ee3e6b 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.rcp.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.rcp.ll
@@ -1,58 +1,47 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s
+
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone
 declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
 
-
 declare float @llvm.sqrt.f32(float) nounwind readnone
-declare double @llvm.sqrt.f64(double) nounwind readnone
 
-; FUNC-LABEL: @rcp_f32
-; SI: V_RCP_F32_e32
+; FUNC-LABEL: {{^}}rcp_f32:
+; SI: v_rcp_f32_e32
+; EG: RECIP_IEEE
 define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind {
   %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @rcp_f64
-; SI: V_RCP_F64_e32
-define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind {
-  %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone
-  store double %rcp, double addrspace(1)* %out, align 8
-  ret void
-}
+; FIXME: Evergreen only ever does unsafe fp math.
+; FUNC-LABEL: {{^}}rcp_pat_f32:
+
+; SI-SAFE: v_rcp_f32_e32
+; XSI-SAFE-SPDENORM-NOT: v_rcp_f32_e32
+
+; EG: RECIP_IEEE
 
-; FUNC-LABEL: @rcp_pat_f32
-; SI: V_RCP_F32_e32
 define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
   %rcp = fdiv float 1.0, %src
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @rcp_pat_f64
-; SI: V_RCP_F64_e32
-define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
-  %rcp = fdiv double 1.0, %src
-  store double %rcp, double addrspace(1)* %out, align 8
-  ret void
-}
+; FUNC-LABEL: {{^}}rsq_rcp_pat_f32:
+; SI-UNSAFE: v_rsq_f32_e32
+; SI-SAFE: v_sqrt_f32_e32
+; SI-SAFE: v_rcp_f32_e32
 
-; FUNC-LABEL: @rsq_rcp_pat_f32
-; SI: V_RSQ_F32_e32
+; EG: RECIPSQRT_IEEE
 define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
   %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone
   %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
 }
-
-; FUNC-LABEL: @rsq_rcp_pat_f64
-; SI: V_RSQ_F64_e32
-define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
-  %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone
-  %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone
-  store double %rcp, double addrspace(1)* %out, align 8
-  ret void
-}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll
index 100d6ff..18854be 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll
@@ -2,8 +2,8 @@
 
 declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone
 
-; FUNC-LABEL: @rsq_clamped_f64
-; SI: V_RSQ_CLAMP_F64_e32
+; FUNC-LABEL: {{^}}rsq_clamped_f64:
+; SI: v_rsq_clamp_f64_e32
 define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind {
   %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone
   store double %rsq_clamped, double addrspace(1)* %out, align 8
diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll
index 683df73..6bf9f0c 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll
@@ -4,8 +4,8 @@
 
 declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone
 
-; FUNC-LABEL: @rsq_clamped_f32
-; SI: V_RSQ_CLAMP_F32_e32
+; FUNC-LABEL: {{^}}rsq_clamped_f32:
+; SI: v_rsq_clamp_f32_e32
 ; EG: RECIPSQRT_CLAMPED
 define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
   %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.ll
index 27cf6b2..d6299b8 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.rsq.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.ll
@@ -3,11 +3,30 @@
 
 declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone
 
-; FUNC-LABEL: @rsq_f32
-; SI: V_RSQ_F32_e32
+; FUNC-LABEL: {{^}}rsq_f32:
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
 ; EG: RECIPSQRT_IEEE
 define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind {
   %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
 }
+
+; TODO: Really these should be constant folded
+; FUNC-LABEL: {{^}}rsq_f32_constant_4.0
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0
+; EG: RECIPSQRT_IEEE
+define void @rsq_f32_constant_4.0(float addrspace(1)* %out) nounwind {
+  %rsq = call float @llvm.AMDGPU.rsq.f32(float 4.0) nounwind readnone
+  store float %rsq, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_f32_constant_100.0
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000
+; EG: RECIPSQRT_IEEE
+define void @rsq_f32_constant_100.0(float addrspace(1)* %out) nounwind {
+  %rsq = call float @llvm.AMDGPU.rsq.f32(float 100.0) nounwind readnone
+  store float %rsq, float addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll b/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
index 1c736d4..2e6bd5c 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
@@ -2,12 +2,12 @@
 
 declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone
 
-; SI-LABEL: @test_trig_preop_f64:
-; SI-DAG: BUFFER_LOAD_DWORD [[SEG:v[0-9]+]]
-; SI-DAG: BUFFER_LOAD_DWORDX2 [[SRC:v\[[0-9]+:[0-9]+\]]],
-; SI: V_TRIG_PREOP_F64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]]
-; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}test_trig_preop_f64:
+; SI-DAG: buffer_load_dword [[SEG:v[0-9]+]]
+; SI-DAG: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]],
+; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]]
+; SI: buffer_store_dwordx2 [[RESULT]],
+; SI: s_endpgm
 define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load double addrspace(1)* %aptr, align 8
   %b = load i32 addrspace(1)* %bptr, align 4
@@ -16,11 +16,11 @@ define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)*
   ret void
 }
 
-; SI-LABEL: @test_trig_preop_f64_imm_segment:
-; SI: BUFFER_LOAD_DWORDX2 [[SRC:v\[[0-9]+:[0-9]+\]]],
-; SI: V_TRIG_PREOP_F64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7
-; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}test_trig_preop_f64_imm_segment:
+; SI: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]],
+; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7
+; SI: buffer_store_dwordx2 [[RESULT]],
+; SI: s_endpgm
 define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
   %a = load double addrspace(1)* %aptr, align 8
   %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
index e6bb2c4..fdd531d 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
@@ -1,10 +1,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
 ; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-; R600-CHECK: @amdgpu_trunc
+; R600-CHECK: {{^}}amdgpu_trunc:
 ; R600-CHECK: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK: @amdgpu_trunc
-; SI-CHECK: V_TRUNC_F32
+; SI-CHECK: {{^}}amdgpu_trunc:
+; SI-CHECK: v_trunc_f32
 
 define void @amdgpu_trunc(float addrspace(1)* %out, float %x) {
 entry:
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umad24.ll b/test/CodeGen/R600/llvm.AMDGPU.umad24.ll
index afdfb18..59d6248 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umad24.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umad24.ll
@@ -5,9 +5,10 @@
 ; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
-; FUNC-LABEL: @test_umad24
-; SI: V_MAD_U32_U24
+; FUNC-LABEL: {{^}}test_umad24:
+; SI: v_mad_u32_u24
 ; EG: MULADD_UINT24
 ; R600: MULLO_UINT
 ; R600: ADD_INT
@@ -17,3 +18,21 @@ define void @test_umad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2
   ret void
 }
 
+; FUNC-LABEL: {{^}}commute_umad24:
+; SI-DAG: buffer_load_dword [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mad_u32_u24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]]
+; SI: buffer_store_dword [[RESULT]]
+define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %out.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %src0.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %src2.gep = getelementptr i32 addrspace(1)* %src0.gep, i32 1
+
+  %src0 = load i32 addrspace(1)* %src0.gep, align 4
+  %src2 = load i32 addrspace(1)* %src2.gep, align 4
+  %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone
+  store i32 %mad, i32 addrspace(1)* %out.gep, align 4
+  ret void
+}
+
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umax.ll b/test/CodeGen/R600/llvm.AMDGPU.umax.ll
index 1b8da2e..ee854ec 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umax.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umax.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @vector_umax
-; SI: V_MAX_U32_e32
+; SI-LABEL: {{^}}vector_umax:
+; SI: v_max_u32_e32
 define void @vector_umax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
 main_body:
   %load = load i32 addrspace(1)* %in, align 4
@@ -11,8 +11,8 @@ main_body:
   ret void
 }
 
-; SI-LABEL: @scalar_umax
-; SI: S_MAX_U32
+; SI-LABEL: {{^}}scalar_umax:
+; SI: s_max_u32
 define void @scalar_umax(i32 %p0, i32 %p1) #0 {
 entry:
   %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %p1)
@@ -21,11 +21,11 @@ entry:
   ret void
 }
 
-; SI-LABEL: @trunc_zext_umax
-; SI: BUFFER_LOAD_UBYTE [[VREG:v[0-9]+]],
-; SI: V_MAX_U32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
-; SI-NOT: AND
-; SI: BUFFER_STORE_SHORT [[RESULT]],
+; SI-LABEL: {{^}}trunc_zext_umax:
+; SI: buffer_load_ubyte [[VREG:v[0-9]+]],
+; SI: v_max_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
+; SI-NOT: and
+; SI: buffer_store_short [[RESULT]],
 define void @trunc_zext_umax(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
   %tmp5 = load i8 addrspace(1)* %src, align 1
   %tmp2 = zext i8 %tmp5 to i32
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umin.ll b/test/CodeGen/R600/llvm.AMDGPU.umin.ll
index 08397f8..2eaa372 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umin.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umin.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @vector_umin
-; SI: V_MIN_U32_e32
+; SI-LABEL: {{^}}vector_umin:
+; SI: v_min_u32_e32
 define void @vector_umin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
 main_body:
   %load = load i32 addrspace(1)* %in, align 4
@@ -11,8 +11,8 @@ main_body:
   ret void
 }
 
-; SI-LABEL: @scalar_umin
-; SI: S_MIN_U32
+; SI-LABEL: {{^}}scalar_umin:
+; SI: s_min_u32
 define void @scalar_umin(i32 %p0, i32 %p1) #0 {
 entry:
   %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %p1)
@@ -21,11 +21,11 @@ entry:
   ret void
 }
 
-; SI-LABEL: @trunc_zext_umin
-; SI: BUFFER_LOAD_UBYTE [[VREG:v[0-9]+]],
-; SI: V_MIN_U32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
-; SI-NOT: AND
-; SI: BUFFER_STORE_SHORT [[RESULT]],
+; SI-LABEL: {{^}}trunc_zext_umin:
+; SI: buffer_load_ubyte [[VREG:v[0-9]+]],
+; SI: v_min_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
+; SI-NOT: and
+; SI: buffer_store_short [[RESULT]],
 define void @trunc_zext_umin(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
   %tmp5 = load i8 addrspace(1)* %src, align 1
   %tmp2 = zext i8 %tmp5 to i32
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umul24.ll b/test/CodeGen/R600/llvm.AMDGPU.umul24.ll
index 72a3602..567ac31 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umul24.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umul24.ll
@@ -6,8 +6,8 @@
 
 declare i32 @llvm.AMDGPU.umul24(i32, i32) nounwind readnone
 
-; FUNC-LABEL: @test_umul24
-; SI: V_MUL_U32_U24
+; FUNC-LABEL: {{^}}test_umul24:
+; SI: v_mul_u32_u24
 ; R600: MUL_UINT24
 ; R600: MULLO_UINT
 define void @test_umul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
diff --git a/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
index 0438ecc..d26bc32 100644
--- a/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
+++ b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: S_MOV_B32
-;CHECK-NEXT: V_INTERP_MOV_F32
+;CHECK: s_mov_b32
+;CHECK-NEXT: v_interp_mov_f32
 
 define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
 main_body:
diff --git a/test/CodeGen/R600/llvm.SI.gather4.ll b/test/CodeGen/R600/llvm.SI.gather4.ll
index 8402faa..91a2012 100644
--- a/test/CodeGen/R600/llvm.SI.gather4.ll
+++ b/test/CodeGen/R600/llvm.SI.gather4.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK-LABEL: @gather4_v2
-;CHECK: IMAGE_GATHER4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_v2:
+;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_v2() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -13,8 +13,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4
-;CHECK: IMAGE_GATHER4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4:
+;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -26,8 +26,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_cl
-;CHECK: IMAGE_GATHER4_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_cl:
+;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_cl() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -39,8 +39,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_l
-;CHECK: IMAGE_GATHER4_L {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_l:
+;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_l() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -52,8 +52,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_b
-;CHECK: IMAGE_GATHER4_B {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_b:
+;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_b() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -65,8 +65,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_b_cl
-;CHECK: IMAGE_GATHER4_B_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_b_cl:
+;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_b_cl() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -78,8 +78,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_b_cl_v8
-;CHECK: IMAGE_GATHER4_B_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_b_cl_v8:
+;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_b_cl_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -91,8 +91,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_lz_v2
-;CHECK: IMAGE_GATHER4_LZ {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_lz_v2:
+;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_lz_v2() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -104,8 +104,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_lz
-;CHECK: IMAGE_GATHER4_LZ {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_lz:
+;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_lz() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -119,8 +119,8 @@ main_body:
 
 
 
-;CHECK-LABEL: @gather4_o
-;CHECK: IMAGE_GATHER4_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_o:
+;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -132,8 +132,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_cl_o
-;CHECK: IMAGE_GATHER4_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_cl_o:
+;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_cl_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -145,8 +145,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_cl_o_v8
-;CHECK: IMAGE_GATHER4_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_cl_o_v8:
+;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_cl_o_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -158,8 +158,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_l_o
-;CHECK: IMAGE_GATHER4_L_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_l_o:
+;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_l_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -171,8 +171,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_l_o_v8
-;CHECK: IMAGE_GATHER4_L_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_l_o_v8:
+;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_l_o_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -184,8 +184,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_b_o
-;CHECK: IMAGE_GATHER4_B_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_b_o:
+;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_b_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -197,8 +197,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_b_o_v8
-;CHECK: IMAGE_GATHER4_B_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_b_o_v8:
+;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_b_o_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -210,8 +210,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_b_cl_o
-;CHECK: IMAGE_GATHER4_B_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_b_cl_o:
+;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_b_cl_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -223,8 +223,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_lz_o
-;CHECK: IMAGE_GATHER4_LZ_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_lz_o:
+;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_lz_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -238,8 +238,8 @@ main_body:
 
 
 
-;CHECK-LABEL: @gather4_c
-;CHECK: IMAGE_GATHER4_C {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c:
+;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -251,8 +251,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_cl
-;CHECK: IMAGE_GATHER4_C_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_cl:
+;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_cl() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -264,8 +264,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_cl_v8
-;CHECK: IMAGE_GATHER4_C_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_cl_v8:
+;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_cl_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -277,8 +277,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_l
-;CHECK: IMAGE_GATHER4_C_L {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_l:
+;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_l() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -290,8 +290,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_l_v8
-;CHECK: IMAGE_GATHER4_C_L {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_l_v8:
+;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_l_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -303,8 +303,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_b
-;CHECK: IMAGE_GATHER4_C_B {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_b:
+;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_b() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -316,8 +316,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_b_v8
-;CHECK: IMAGE_GATHER4_C_B {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_b_v8:
+;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_b_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -329,8 +329,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_b_cl
-;CHECK: IMAGE_GATHER4_C_B_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_b_cl:
+;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_b_cl() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -342,8 +342,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_lz
-;CHECK: IMAGE_GATHER4_C_LZ {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_lz:
+;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_lz() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -357,8 +357,8 @@ main_body:
 
 
 
-;CHECK-LABEL: @gather4_c_o
-;CHECK: IMAGE_GATHER4_C_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_o:
+;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -370,8 +370,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_o_v8
-;CHECK: IMAGE_GATHER4_C_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_o_v8:
+;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_o_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -383,8 +383,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_cl_o
-;CHECK: IMAGE_GATHER4_C_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_cl_o:
+;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_cl_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -396,8 +396,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_l_o
-;CHECK: IMAGE_GATHER4_C_L_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_l_o:
+;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_l_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -409,8 +409,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_b_o
-;CHECK: IMAGE_GATHER4_C_B_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_b_o:
+;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_b_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -422,8 +422,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_b_cl_o
-;CHECK: IMAGE_GATHER4_C_B_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_b_cl_o:
+;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_b_cl_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -435,8 +435,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_lz_o
-;CHECK: IMAGE_GATHER4_C_LZ_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_lz_o:
+;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_lz_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -448,8 +448,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_lz_o_v8
-;CHECK: IMAGE_GATHER4_C_LZ_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_lz_o_v8:
+;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_lz_o_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
diff --git a/test/CodeGen/R600/llvm.SI.getlod.ll b/test/CodeGen/R600/llvm.SI.getlod.ll
index a7a17ec..ec26fe5 100644
--- a/test/CodeGen/R600/llvm.SI.getlod.ll
+++ b/test/CodeGen/R600/llvm.SI.getlod.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK-LABEL: @getlod
-;CHECK: IMAGE_GET_LOD {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}getlod:
+;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @getlod() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -11,8 +11,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @getlod_v2
-;CHECK: IMAGE_GET_LOD {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}getlod_v2:
+;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @getlod_v2() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -22,8 +22,8 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: @getlod_v4
-;CHECK: IMAGE_GET_LOD {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}getlod_v4:
+;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @getlod_v4() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
diff --git a/test/CodeGen/R600/llvm.SI.image.ll b/test/CodeGen/R600/llvm.SI.image.ll
new file mode 100644
index 0000000..4eec543
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.image.ll
@@ -0,0 +1,49 @@
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}image_load:
+;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @image_load() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}image_load_mip:
+;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @image_load_mip() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}getresinfo:
+;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @getresinfo() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.getresinfo.i32(i32 undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/llvm.SI.image.sample.ll b/test/CodeGen/R600/llvm.SI.image.sample.ll
new file mode 100644
index 0000000..ebff391
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.image.sample.ll
@@ -0,0 +1,289 @@
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}sample:
+;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cl:
+;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_d:
+;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_d() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_d_cl:
+;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_d_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_l:
+;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_l() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_b:
+;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_b() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_b_cl:
+;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_b_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_lz:
+;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_lz() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cd:
+;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cd() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cd_cl:
+;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cd_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c:
+;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cl:
+;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_d:
+;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_d() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_d_cl:
+;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_d_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_l:
+;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_l() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_b:
+;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_b() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_b_cl:
+;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_b_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_lz:
+;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_lz() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cd:
+;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cd() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cd_cl:
+;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cd_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+
+declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/llvm.SI.image.sample.o.ll b/test/CodeGen/R600/llvm.SI.image.sample.o.ll
new file mode 100644
index 0000000..dbc1b2b
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.image.sample.o.ll
@@ -0,0 +1,289 @@
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}sample:
+;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cl:
+;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_d:
+;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_d() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_d_cl:
+;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_d_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_l:
+;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_l() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_b:
+;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_b() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_b_cl:
+;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_b_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_lz:
+;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_lz() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cd:
+;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cd() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cd_cl:
+;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cd_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c:
+;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cl:
+;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_d:
+;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_d() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_d_cl:
+;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_d_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_l:
+;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_l() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_b:
+;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_b() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_b_cl:
+;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_b_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_lz:
+;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_lz() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cd:
+;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cd() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cd_cl:
+;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cd_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+
+declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/llvm.SI.imageload.ll b/test/CodeGen/R600/llvm.SI.imageload.ll
index 59e00f0..673d92d 100644
--- a/test/CodeGen/R600/llvm.SI.imageload.ll
+++ b/test/CodeGen/R600/llvm.SI.imageload.ll
@@ -1,15 +1,15 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK-DAG: IMAGE_LOAD {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
-;CHECK-DAG: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 2, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 1, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 4, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 8, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
-;CHECK-DAG: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 8, 0, 0, -1
+;CHECK-DAG: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
+;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v[0-9]+}}, 2, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v[0-9]+}}, 1, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v[0-9]+}}, 4, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
+;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, -1
 
 define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
@@ -84,7 +84,7 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
 
 ; Test that ccordinates are stored in vgprs and not sgprs
 ; CHECK: vgpr_coords
-; CHECK: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}
+; CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}
 define void @vgpr_coords(float addrspace(2)* addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr float addrspace(2)* addrspace(2)* %0, i32 0
diff --git a/test/CodeGen/R600/llvm.SI.load.dword.ll b/test/CodeGen/R600/llvm.SI.load.dword.ll
index a622775..e5c6201 100644
--- a/test/CodeGen/R600/llvm.SI.load.dword.ll
+++ b/test/CodeGen/R600/llvm.SI.load.dword.ll
@@ -3,11 +3,11 @@
 ; Example of a simple geometry shader loading vertex attributes from the
 ; ESGS ring buffer
 
-; CHECK-LABEL: @main
-; CHECK: BUFFER_LOAD_DWORD
-; CHECK: BUFFER_LOAD_DWORD
-; CHECK: BUFFER_LOAD_DWORD
-; CHECK: BUFFER_LOAD_DWORD
+; CHECK-LABEL: {{^}}main:
+; CHECK: buffer_load_dword
+; CHECK: buffer_load_dword
+; CHECK: buffer_load_dword
+; CHECK: buffer_load_dword
 
 define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, [2 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* inreg, [17 x <16 x i8>] addrspace(2)* inreg, i32, i32, i32, i32) #0 {
 main_body:
diff --git a/test/CodeGen/R600/llvm.SI.resinfo.ll b/test/CodeGen/R600/llvm.SI.resinfo.ll
index af3afc1..d8f3722 100644
--- a/test/CodeGen/R600/llvm.SI.resinfo.ll
+++ b/test/CodeGen/R600/llvm.SI.resinfo.ll
@@ -1,21 +1,21 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
 
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 2, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 1, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 4, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 8, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 9, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 6, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 10, 0, 0, -1
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 11, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 13, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 14, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 8, 0, 0, -1
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 2, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 1, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 4, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 9, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 6, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 10, 0, 0, -1
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 11, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 13, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 14, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, -1
 
 define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8,
 		  i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) {
diff --git a/test/CodeGen/R600/llvm.SI.sample-masked.ll b/test/CodeGen/R600/llvm.SI.sample-masked.ll
index 445359a..9e86bec 100644
--- a/test/CodeGen/R600/llvm.SI.sample-masked.ll
+++ b/test/CodeGen/R600/llvm.SI.sample-masked.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
 
-; CHECK-LABEL: @v1
-; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 13
+; CHECK-LABEL: {{^}}v1:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 13
 define void @v1(i32 %a1) #0 {
 entry:
   %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
@@ -13,8 +13,8 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @v2
-; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 11
+; CHECK-LABEL: {{^}}v2:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 11
 define void @v2(i32 %a1) #0 {
 entry:
   %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
@@ -26,8 +26,8 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @v3
-; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 14
+; CHECK-LABEL: {{^}}v3:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14
 define void @v3(i32 %a1) #0 {
 entry:
   %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
@@ -39,8 +39,8 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @v4
-; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 7
+; CHECK-LABEL: {{^}}v4:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 7
 define void @v4(i32 %a1) #0 {
 entry:
   %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
@@ -52,8 +52,8 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @v5
-; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 10
+; CHECK-LABEL: {{^}}v5:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10
 define void @v5(i32 %a1) #0 {
 entry:
   %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
@@ -64,8 +64,8 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @v6
-; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 6
+; CHECK-LABEL: {{^}}v6:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 6
 define void @v6(i32 %a1) #0 {
 entry:
   %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
@@ -76,8 +76,8 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @v7
-; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 9
+; CHECK-LABEL: {{^}}v7:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 9
 define void @v7(i32 %a1) #0 {
 entry:
   %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
diff --git a/test/CodeGen/R600/llvm.SI.sample.ll b/test/CodeGen/R600/llvm.SI.sample.ll
index 24e8f64..a1d2c02 100644
--- a/test/CodeGen/R600/llvm.SI.sample.ll
+++ b/test/CodeGen/R600/llvm.SI.sample.ll
@@ -1,21 +1,21 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 15
-;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 3
-;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 2
-;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 1
-;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 4
-;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 8
-;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 5
-;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 9
-;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 6
-;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 10
-;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 12
-;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 7
-;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 11
-;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 13
-;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 14
-;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 8
+;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15
+;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 3
+;CHECK-DAG: image_sample {{v[0-9]+}}, 2
+;CHECK-DAG: image_sample {{v[0-9]+}}, 1
+;CHECK-DAG: image_sample {{v[0-9]+}}, 4
+;CHECK-DAG: image_sample {{v[0-9]+}}, 8
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 5
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 9
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 6
+;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10
+;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 12
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 7
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 11
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 13
+;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14
+;CHECK-DAG: image_sample {{v[0-9]+}}, 8
 
 define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 {
    %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
@@ -135,8 +135,8 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 {
    ret void
 }
 
-; CHECK: @v1
-; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 15
+; CHECK: {{^}}v1:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15
 define void @v1(i32 %a1) #0 {
 entry:
   %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
diff --git a/test/CodeGen/R600/llvm.SI.sampled.ll b/test/CodeGen/R600/llvm.SI.sampled.ll
index 366456f..91b71f3 100644
--- a/test/CodeGen/R600/llvm.SI.sampled.ll
+++ b/test/CodeGen/R600/llvm.SI.sampled.ll
@@ -1,21 +1,21 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 15
-;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 3
-;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 2
-;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 1
-;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 4
-;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 8
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 5
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 9
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 6
-;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 10
-;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 12
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 7
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 11
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 13
-;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 14
-;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 8
+;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15
+;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 3
+;CHECK-DAG: image_sample_d {{v[0-9]+}}, 2
+;CHECK-DAG: image_sample_d {{v[0-9]+}}, 1
+;CHECK-DAG: image_sample_d {{v[0-9]+}}, 4
+;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 5
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 9
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 6
+;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 10
+;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 12
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 7
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 11
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 13
+;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 14
+;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8
 
 define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 {
    %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
diff --git a/test/CodeGen/R600/llvm.SI.sendmsg.ll b/test/CodeGen/R600/llvm.SI.sendmsg.ll
index 581d422..042fc5b 100644
--- a/test/CodeGen/R600/llvm.SI.sendmsg.ll
+++ b/test/CodeGen/R600/llvm.SI.sendmsg.ll
@@ -1,10 +1,10 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-; CHECK-LABEL: @main
-; CHECK: S_SENDMSG Gs(emit stream 0)
-; CHECK: S_SENDMSG Gs(cut stream 1)
-; CHECK: S_SENDMSG Gs(emit-cut stream 2)
-; CHECK: S_SENDMSG Gs_done(nop)
+; CHECK-LABEL: {{^}}main:
+; CHECK: s_sendmsg Gs(emit stream 0)
+; CHECK: s_sendmsg Gs(cut stream 1)
+; CHECK: s_sendmsg Gs(emit-cut stream 2)
+; CHECK: s_sendmsg Gs_done(nop)
 
 define void @main() {
 main_body:
diff --git a/test/CodeGen/R600/llvm.SI.tbuffer.store.ll b/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
index 740581a..702daea 100644
--- a/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
+++ b/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK-LABEL: @test1
-;CHECK: TBUFFER_STORE_FORMAT_XYZW {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK-LABEL: {{^}}test1:
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
 define void @test1(i32 %a1, i32 %vaddr) #0 {
     %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
     call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
@@ -10,8 +10,8 @@ define void @test1(i32 %a1, i32 %vaddr) #0 {
     ret void
 }
 
-;CHECK-LABEL: @test2
-;CHECK: TBUFFER_STORE_FORMAT_XYZ {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK-LABEL: {{^}}test2:
+;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
 define void @test2(i32 %a1, i32 %vaddr) #0 {
     %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
     call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
@@ -20,8 +20,8 @@ define void @test2(i32 %a1, i32 %vaddr) #0 {
     ret void
 }
 
-;CHECK-LABEL: @test3
-;CHECK: TBUFFER_STORE_FORMAT_XY {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK-LABEL: {{^}}test3:
+;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
 define void @test3(i32 %a1, i32 %vaddr) #0 {
     %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0
     call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata,
@@ -30,8 +30,8 @@ define void @test3(i32 %a1, i32 %vaddr) #0 {
     ret void
 }
 
-;CHECK-LABEL: @test4
-;CHECK: TBUFFER_STORE_FORMAT_X {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK-LABEL: {{^}}test4:
+;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
 define void @test4(i32 %vdata, i32 %vaddr) #0 {
     call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata,
         i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,
diff --git a/test/CodeGen/R600/llvm.SI.tid.ll b/test/CodeGen/R600/llvm.SI.tid.ll
index fe17304..ee96124 100644
--- a/test/CodeGen/R600/llvm.SI.tid.ll
+++ b/test/CodeGen/R600/llvm.SI.tid.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: V_MBCNT_LO_U32_B32_e64
-;CHECK: V_MBCNT_HI_U32_B32_e32
+;CHECK: v_mbcnt_lo_u32_b32_e64
+;CHECK: v_mbcnt_hi_u32_b32_e32
 
 define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
 main_body:
diff --git a/test/CodeGen/R600/llvm.amdgpu.dp4.ll b/test/CodeGen/R600/llvm.amdgpu.dp4.ll
new file mode 100644
index 0000000..812b6a4
--- /dev/null
+++ b/test/CodeGen/R600/llvm.amdgpu.dp4.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s
+
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) nounwind readnone
+
+define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind {
+  %src0 = load <4 x float> addrspace(1)* %a, align 16
+  %src1 = load <4 x float> addrspace(1)* %b, align 16
+  %dp4 = call float @llvm.AMDGPU.dp4(<4 x float> %src0, <4 x float> %src1) nounwind readnone
+  store float %dp4, float addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.amdgpu.kilp.ll b/test/CodeGen/R600/llvm.amdgpu.kilp.ll
new file mode 100644
index 0000000..08bee38
--- /dev/null
+++ b/test/CodeGen/R600/llvm.amdgpu.kilp.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}kilp_gs_const:
+; SI: s_mov_b64 exec, 0
+define void @kilp_gs_const() #0 {
+main_body:
+  %0 = icmp ule i32 0, 3
+  %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00
+  call void @llvm.AMDGPU.kilp(float %1)
+  %2 = icmp ule i32 3, 0
+  %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00
+  call void @llvm.AMDGPU.kilp(float %3)
+  ret void
+}
+
+declare void @llvm.AMDGPU.kilp(float)
+
+attributes #0 = { "ShaderType"="2" }
+
+!0 = metadata !{metadata !"const", null, i32 1}
diff --git a/test/CodeGen/R600/llvm.amdgpu.lrp.ll b/test/CodeGen/R600/llvm.amdgpu.lrp.ll
new file mode 100644
index 0000000..ee922fe
--- /dev/null
+++ b/test/CodeGen/R600/llvm.amdgpu.lrp.ll
@@ -0,0 +1,12 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone
+
+; FUNC-LABEL: {{^}}test_lrp:
+; SI: v_sub_f32
+; SI: v_mad_f32
+define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind {
+  %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone
+  store float %mad, float addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.cos.ll b/test/CodeGen/R600/llvm.cos.ll
index 9e7a4de..837340f 100644
--- a/test/CodeGen/R600/llvm.cos.ll
+++ b/test/CodeGen/R600/llvm.cos.ll
@@ -7,8 +7,8 @@
 ;EG: ADD *
 ;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ;EG-NOT: COS
-;SI: V_COS_F32
-;SI-NOT: V_COS_F32
+;SI: v_cos_f32
+;SI-NOT: v_cos_f32
 
 define void @test(float addrspace(1)* %out, float %x) #1 {
    %cos = call float @llvm.cos.f32(float %x)
@@ -22,11 +22,11 @@ define void @test(float addrspace(1)* %out, float %x) #1 {
 ;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ;EG-NOT: COS
-;SI: V_COS_F32
-;SI: V_COS_F32
-;SI: V_COS_F32
-;SI: V_COS_F32
-;SI-NOT: V_COS_F32
+;SI: v_cos_f32
+;SI: v_cos_f32
+;SI: v_cos_f32
+;SI: v_cos_f32
+;SI-NOT: v_cos_f32
 
 define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 {
    %cos = call <4 x float> @llvm.cos.v4f32(<4 x float> %vx)
diff --git a/test/CodeGen/R600/llvm.exp2.ll b/test/CodeGen/R600/llvm.exp2.ll
index 119d5ef..52dc67d 100644
--- a/test/CodeGen/R600/llvm.exp2.ll
+++ b/test/CodeGen/R600/llvm.exp2.ll
@@ -2,13 +2,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK --check-prefix=FUNC
 ;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
 
-;FUNC-LABEL: @test
+;FUNC-LABEL: {{^}}test:
 ;EG-CHECK: EXP_IEEE
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: V_EXP_F32
+;SI-CHECK: v_exp_f32
 
 define void @test(float addrspace(1)* %out, float %in) {
 entry:
@@ -17,7 +17,7 @@ entry:
    ret void
 }
 
-;FUNC-LABEL: @testv2
+;FUNC-LABEL: {{^}}testv2:
 ;EG-CHECK: EXP_IEEE
 ;EG-CHECK: EXP_IEEE
 ; FIXME: We should be able to merge these packets together on Cayman so we
@@ -30,8 +30,8 @@ entry:
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: V_EXP_F32
-;SI-CHECK: V_EXP_F32
+;SI-CHECK: v_exp_f32
+;SI-CHECK: v_exp_f32
 
 define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
@@ -40,7 +40,7 @@ entry:
   ret void
 }
 
-;FUNC-LABEL: @testv4
+;FUNC-LABEL: {{^}}testv4:
 ;EG-CHECK: EXP_IEEE
 ;EG-CHECK: EXP_IEEE
 ;EG-CHECK: EXP_IEEE
@@ -63,10 +63,10 @@ entry:
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: V_EXP_F32
-;SI-CHECK: V_EXP_F32
-;SI-CHECK: V_EXP_F32
-;SI-CHECK: V_EXP_F32
+;SI-CHECK: v_exp_f32
+;SI-CHECK: v_exp_f32
+;SI-CHECK: v_exp_f32
+;SI-CHECK: v_exp_f32
 define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in)
diff --git a/test/CodeGen/R600/llvm.floor.ll b/test/CodeGen/R600/llvm.floor.ll
index f7071cd..0c7a15b 100644
--- a/test/CodeGen/R600/llvm.floor.ll
+++ b/test/CodeGen/R600/llvm.floor.ll
@@ -1,10 +1,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; R600-CHECK: @f32
+; R600-CHECK: {{^}}f32:
 ; R600-CHECK: FLOOR
-; SI-CHECK: @f32
-; SI-CHECK: V_FLOOR_F32_e32
+; SI-CHECK: {{^}}f32:
+; SI-CHECK: v_floor_f32_e32
 define void @f32(float addrspace(1)* %out, float %in) {
 entry:
   %0 = call float @llvm.floor.f32(float %in)
@@ -12,12 +12,12 @@ entry:
   ret void
 }
 
-; R600-CHECK: @v2f32
+; R600-CHECK: {{^}}v2f32:
 ; R600-CHECK: FLOOR
 ; R600-CHECK: FLOOR
-; SI-CHECK: @v2f32
-; SI-CHECK: V_FLOOR_F32_e32
-; SI-CHECK: V_FLOOR_F32_e32
+; SI-CHECK: {{^}}v2f32:
+; SI-CHECK: v_floor_f32_e32
+; SI-CHECK: v_floor_f32_e32
 define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
   %0 = call <2 x float> @llvm.floor.v2f32(<2 x float> %in)
@@ -25,16 +25,16 @@ entry:
   ret void
 }
 
-; R600-CHECK: @v4f32
+; R600-CHECK: {{^}}v4f32:
 ; R600-CHECK: FLOOR
 ; R600-CHECK: FLOOR
 ; R600-CHECK: FLOOR
 ; R600-CHECK: FLOOR
-; SI-CHECK: @v4f32
-; SI-CHECK: V_FLOOR_F32_e32
-; SI-CHECK: V_FLOOR_F32_e32
-; SI-CHECK: V_FLOOR_F32_e32
-; SI-CHECK: V_FLOOR_F32_e32
+; SI-CHECK: {{^}}v4f32:
+; SI-CHECK: v_floor_f32_e32
+; SI-CHECK: v_floor_f32_e32
+; SI-CHECK: v_floor_f32_e32
+; SI-CHECK: v_floor_f32_e32
 define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.floor.v4f32(<4 x float> %in)
diff --git a/test/CodeGen/R600/llvm.log2.ll b/test/CodeGen/R600/llvm.log2.ll
index 4cba2d4..0b54a46 100644
--- a/test/CodeGen/R600/llvm.log2.ll
+++ b/test/CodeGen/R600/llvm.log2.ll
@@ -2,13 +2,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK --check-prefix=FUNC
 ;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
 
-;FUNC-LABEL: @test
+;FUNC-LABEL: {{^}}test:
 ;EG-CHECK: LOG_IEEE
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: V_LOG_F32
+;SI-CHECK: v_log_f32
 
 define void @test(float addrspace(1)* %out, float %in) {
 entry:
@@ -17,7 +17,7 @@ entry:
    ret void
 }
 
-;FUNC-LABEL: @testv2
+;FUNC-LABEL: {{^}}testv2:
 ;EG-CHECK: LOG_IEEE
 ;EG-CHECK: LOG_IEEE
 ; FIXME: We should be able to merge these packets together on Cayman so we
@@ -30,8 +30,8 @@ entry:
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: V_LOG_F32
-;SI-CHECK: V_LOG_F32
+;SI-CHECK: v_log_f32
+;SI-CHECK: v_log_f32
 
 define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
@@ -40,7 +40,7 @@ entry:
   ret void
 }
 
-;FUNC-LABEL: @testv4
+;FUNC-LABEL: {{^}}testv4:
 ;EG-CHECK: LOG_IEEE
 ;EG-CHECK: LOG_IEEE
 ;EG-CHECK: LOG_IEEE
@@ -63,10 +63,10 @@ entry:
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: V_LOG_F32
-;SI-CHECK: V_LOG_F32
-;SI-CHECK: V_LOG_F32
-;SI-CHECK: V_LOG_F32
+;SI-CHECK: v_log_f32
+;SI-CHECK: v_log_f32
+;SI-CHECK: v_log_f32
+;SI-CHECK: v_log_f32
 define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in)
diff --git a/test/CodeGen/R600/llvm.memcpy.ll b/test/CodeGen/R600/llvm.memcpy.ll
new file mode 100644
index 0000000..5f2710a
--- /dev/null
+++ b/test/CodeGen/R600/llvm.memcpy.ll
@@ -0,0 +1,364 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind
+
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+
+; SI: s_endpgm
+define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
+  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
+  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2:
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+
+; SI: s_endpgm
+define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
+  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
+  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4:
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI: s_endpgm
+define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
+  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
+  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind
+  ret void
+}
+
+; FIXME: Use 64-bit ops
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8:
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: s_endpgm
+define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
+  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
+  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align1:
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+
+; SI: s_endpgm
+define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
+  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align2:
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+
+; SI: s_endpgm
+define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
+  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align4:
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: s_endpgm
+define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
+  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align8:
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: s_endpgm
+define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
+  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align16:
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: s_endpgm
+define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
+  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind
+  ret void
+}
diff --git a/test/CodeGen/R600/llvm.rint.f64.ll b/test/CodeGen/R600/llvm.rint.f64.ll
index 3e2884b..72b546e 100644
--- a/test/CodeGen/R600/llvm.rint.f64.ll
+++ b/test/CodeGen/R600/llvm.rint.f64.ll
@@ -1,15 +1,15 @@
 ; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @rint_f64
-; CI: V_RNDNE_F64_e32
+; FUNC-LABEL: {{^}}rint_f64:
+; CI: v_rndne_f64_e32
 
-; SI-DAG: V_ADD_F64
-; SI-DAG: V_ADD_F64
-; SI-DAG V_CMP_GT_F64_e64
-; SI: V_CNDMASK_B32
-; SI: V_CNDMASK_B32
-; SI: S_ENDPGM
+; SI-DAG: v_add_f64
+; SI-DAG: v_add_f64
+; SI-DAG v_cmp_gt_f64_e64
+; SI: v_cndmask_b32
+; SI: v_cndmask_b32
+; SI: s_endpgm
 define void @rint_f64(double addrspace(1)* %out, double %in) {
 entry:
   %0 = call double @llvm.rint.f64(double %in)
@@ -17,9 +17,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @rint_v2f64
-; CI: V_RNDNE_F64_e32
-; CI: V_RNDNE_F64_e32
+; FUNC-LABEL: {{^}}rint_v2f64:
+; CI: v_rndne_f64_e32
+; CI: v_rndne_f64_e32
 define void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
 entry:
   %0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in)
@@ -27,11 +27,11 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @rint_v4f64
-; CI: V_RNDNE_F64_e32
-; CI: V_RNDNE_F64_e32
-; CI: V_RNDNE_F64_e32
-; CI: V_RNDNE_F64_e32
+; FUNC-LABEL: {{^}}rint_v4f64:
+; CI: v_rndne_f64_e32
+; CI: v_rndne_f64_e32
+; CI: v_rndne_f64_e32
+; CI: v_rndne_f64_e32
 define void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
 entry:
   %0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in)
diff --git a/test/CodeGen/R600/llvm.rint.ll b/test/CodeGen/R600/llvm.rint.ll
index 209bb43..2e05964 100644
--- a/test/CodeGen/R600/llvm.rint.ll
+++ b/test/CodeGen/R600/llvm.rint.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @rint_f32
+; FUNC-LABEL: {{^}}rint_f32:
 ; R600: RNDNE
 
-; SI: V_RNDNE_F32_e32
+; SI: v_rndne_f32_e32
 define void @rint_f32(float addrspace(1)* %out, float %in) {
 entry:
   %0 = call float @llvm.rint.f32(float %in) #0
@@ -12,12 +12,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @rint_v2f32
+; FUNC-LABEL: {{^}}rint_v2f32:
 ; R600: RNDNE
 ; R600: RNDNE
 
-; SI: V_RNDNE_F32_e32
-; SI: V_RNDNE_F32_e32
+; SI: v_rndne_f32_e32
+; SI: v_rndne_f32_e32
 define void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
   %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in) #0
@@ -25,16 +25,16 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @rint_v4f32
+; FUNC-LABEL: {{^}}rint_v4f32:
 ; R600: RNDNE
 ; R600: RNDNE
 ; R600: RNDNE
 ; R600: RNDNE
 
-; SI: V_RNDNE_F32_e32
-; SI: V_RNDNE_F32_e32
-; SI: V_RNDNE_F32_e32
-; SI: V_RNDNE_F32_e32
+; SI: v_rndne_f32_e32
+; SI: v_rndne_f32_e32
+; SI: v_rndne_f32_e32
+; SI: v_rndne_f32_e32
 define void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in) #0
@@ -42,10 +42,10 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @legacy_amdil_round_nearest_f32
+; FUNC-LABEL: {{^}}legacy_amdil_round_nearest_f32:
 ; R600: RNDNE
 
-; SI: V_RNDNE_F32_e32
+; SI: v_rndne_f32_e32
 define void @legacy_amdil_round_nearest_f32(float addrspace(1)* %out, float %in) {
 entry:
   %0 = call float @llvm.AMDIL.round.nearest.f32(float %in) #0
diff --git a/test/CodeGen/R600/llvm.round.ll b/test/CodeGen/R600/llvm.round.ll
index e06d45d..bedf4ba 100644
--- a/test/CodeGen/R600/llvm.round.ll
+++ b/test/CodeGen/R600/llvm.round.ll
@@ -1,11 +1,13 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600 --check-prefix=FUNC
 
-; FUNC-LABEL: @f32
-; R600: FRACT
-; R600-DAG: ADD
-; R600-DAG: CEIL
-; R600-DAG: FLOOR
-; R600: CNDGE
+; FUNC-LABEL: {{^}}f32:
+; R600: FRACT {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
+; R600-DAG: ADD  {{.*}}, -0.5
+; R600-DAG: CEIL {{.*}} [[ARG]]
+; R600-DAG: FLOOR {{.*}} [[ARG]]
+; R600-DAG: CNDGE
+; R600-DAG: CNDGT
+; R600: CNDGE {{[^,]+}}, [[ARG]]
 define void @f32(float addrspace(1)* %out, float %in) {
 entry:
   %0 = call float @llvm.round.f32(float %in)
diff --git a/test/CodeGen/R600/llvm.sin.ll b/test/CodeGen/R600/llvm.sin.ll
index 41c363c..7e45710 100644
--- a/test/CodeGen/R600/llvm.sin.ll
+++ b/test/CodeGen/R600/llvm.sin.ll
@@ -1,35 +1,84 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC
-;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s -check-prefix=SI -check-prefix=FUNC
-
-;FUNC-LABEL: test
-;EG: MULADD_IEEE *
-;EG: FRACT *
-;EG: ADD *
-;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-;EG-NOT: SIN
-;SI: V_MUL_F32
-;SI: V_SIN_F32
-;SI-NOT: V_SIN_F32
-
-define void @test(float addrspace(1)* %out, float %x) #1 {
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
+
+; FUNC-LABEL: sin_f32
+; EG: MULADD_IEEE *
+; EG: FRACT *
+; EG: ADD *
+; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG-NOT: SIN
+; SI: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+
+define void @sin_f32(float addrspace(1)* %out, float %x) #1 {
    %sin = call float @llvm.sin.f32(float %x)
    store float %sin, float addrspace(1)* %out
    ret void
 }
 
-;FUNC-LABEL: testv
-;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-;EG-NOT: SIN
-;SI: V_SIN_F32
-;SI: V_SIN_F32
-;SI: V_SIN_F32
-;SI: V_SIN_F32
-;SI-NOT: V_SIN_F32
-
-define void @testv(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 {
+; FUNC-LABEL: {{^}}sin_3x_f32:
+; SI-UNSAFE-NOT: v_add_f32
+; SI-UNSAFE: 0x3ef47644
+; SI-UNSAFE: v_mul_f32
+; SI-SAFE: v_mul_f32
+; SI-SAFE: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+define void @sin_3x_f32(float addrspace(1)* %out, float %x) #1 {
+  %y = fmul float 3.0, %x
+  %sin = call float @llvm.sin.f32(float %y)
+  store float %sin, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sin_2x_f32:
+; SI-UNSAFE-NOT: v_add_f32
+; SI-UNSAFE: 0x3ea2f983
+; SI-UNSAFE: v_mul_f32
+; SI-SAFE: v_add_f32
+; SI-SAFE: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+define void @sin_2x_f32(float addrspace(1)* %out, float %x) #1 {
+  %y = fmul float 2.0, %x
+  %sin = call float @llvm.sin.f32(float %y)
+  store float %sin, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_2sin_f32:
+; SI-UNSAFE: 0x3ea2f983
+; SI-UNSAFE: v_mul_f32
+; SI-SAFE: v_add_f32
+; SI-SAFE: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+define void @test_2sin_f32(float addrspace(1)* %out, float %x) #1 {
+   %y = fmul float 2.0, %x
+   %sin = call float @llvm.sin.f32(float %y)
+   store float %sin, float addrspace(1)* %out
+   ret void
+}
+
+; FUNC-LABEL: {{^}}sin_v4f32:
+; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG-NOT: SIN
+; SI: v_sin_f32
+; SI: v_sin_f32
+; SI: v_sin_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+
+define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 {
    %sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx)
    store <4 x float> %sin, <4 x float> addrspace(1)* %out
    ret void
diff --git a/test/CodeGen/R600/llvm.sqrt.ll b/test/CodeGen/R600/llvm.sqrt.ll
index 4eee37f..c039225 100644
--- a/test/CodeGen/R600/llvm.sqrt.ll
+++ b/test/CodeGen/R600/llvm.sqrt.ll
@@ -1,11 +1,11 @@
 ; RUN: llc < %s -march=r600 --mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
 ; RUN: llc < %s -march=r600 --mcpu=SI -verify-machineinstrs| FileCheck %s --check-prefix=SI-CHECK
 
-; R600-CHECK-LABEL: @sqrt_f32
+; R600-CHECK-LABEL: {{^}}sqrt_f32:
 ; R600-CHECK: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
 ; R600-CHECK: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
-; SI-CHECK-LABEL: @sqrt_f32
-; SI-CHECK: V_SQRT_F32_e32
+; SI-CHECK-LABEL: {{^}}sqrt_f32:
+; SI-CHECK: v_sqrt_f32_e32
 define void @sqrt_f32(float addrspace(1)* %out, float %in) {
 entry:
   %0 = call float @llvm.sqrt.f32(float %in)
@@ -13,14 +13,14 @@ entry:
   ret void
 }
 
-; R600-CHECK-LABEL: @sqrt_v2f32
+; R600-CHECK-LABEL: {{^}}sqrt_v2f32:
 ; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W
 ; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
 ; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
 ; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
-; SI-CHECK-LABEL: @sqrt_v2f32
-; SI-CHECK: V_SQRT_F32_e32
-; SI-CHECK: V_SQRT_F32_e32
+; SI-CHECK-LABEL: {{^}}sqrt_v2f32:
+; SI-CHECK: v_sqrt_f32_e32
+; SI-CHECK: v_sqrt_f32_e32
 define void @sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
   %0 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
@@ -28,7 +28,7 @@ entry:
   ret void
 }
 
-; R600-CHECK-LABEL: @sqrt_v4f32
+; R600-CHECK-LABEL: {{^}}sqrt_v4f32:
 ; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS
 ; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z
@@ -37,11 +37,11 @@ entry:
 ; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
 ; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
 ; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
-; SI-CHECK-LABEL: @sqrt_v4f32
-; SI-CHECK: V_SQRT_F32_e32
-; SI-CHECK: V_SQRT_F32_e32
-; SI-CHECK: V_SQRT_F32_e32
-; SI-CHECK: V_SQRT_F32_e32
+; SI-CHECK-LABEL: {{^}}sqrt_v4f32:
+; SI-CHECK: v_sqrt_f32_e32
+; SI-CHECK: v_sqrt_f32_e32
+; SI-CHECK: v_sqrt_f32_e32
+; SI-CHECK: v_sqrt_f32_e32
 define void @sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
diff --git a/test/CodeGen/R600/llvm.trunc.ll b/test/CodeGen/R600/llvm.trunc.ll
index fa6fb99..5585477 100644
--- a/test/CodeGen/R600/llvm.trunc.ll
+++ b/test/CodeGen/R600/llvm.trunc.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK-LABEL: @trunc_f32
+; CHECK-LABEL: {{^}}trunc_f32:
 ; CHECK: TRUNC
 
 define void @trunc_f32(float addrspace(1)* %out, float %in) {
diff --git a/test/CodeGen/R600/load-i1.ll b/test/CodeGen/R600/load-i1.ll
index 9ba81b8..d85e16f 100644
--- a/test/CodeGen/R600/load-i1.ll
+++ b/test/CodeGen/R600/load-i1.ll
@@ -1,21 +1,21 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 
 
-; SI-LABEL: @global_copy_i1_to_i1
-; SI: BUFFER_LOAD_UBYTE
-; SI: V_AND_B32_e32 v{{[0-9]+}}, 1
-; SI: BUFFER_STORE_BYTE
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}global_copy_i1_to_i1:
+; SI: buffer_load_ubyte
+; SI: v_and_b32_e32 v{{[0-9]+}}, 1
+; SI: buffer_store_byte
+; SI: s_endpgm
 define void @global_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1 addrspace(1)* %in
   store i1 %load, i1 addrspace(1)* %out, align 1
   ret void
 }
 
-; SI-LABEL: @global_sextload_i1_to_i32
+; SI-LABEL: {{^}}global_sextload_i1_to_i32:
 ; XSI: BUFFER_LOAD_BYTE
-; SI: BUFFER_STORE_DWORD
-; SI: S_ENDPGM
+; SI: buffer_store_dword
+; SI: s_endpgm
 define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
@@ -23,10 +23,10 @@ define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
   ret void
 }
 
-; SI-LABEL: @global_zextload_i1_to_i32
-; SI: BUFFER_LOAD_UBYTE
-; SI: BUFFER_STORE_DWORD
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}global_zextload_i1_to_i32:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+; SI: s_endpgm
 define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
@@ -34,10 +34,10 @@ define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
   ret void
 }
 
-; SI-LABEL: @global_sextload_i1_to_i64
+; SI-LABEL: {{^}}global_sextload_i1_to_i64:
 ; XSI: BUFFER_LOAD_BYTE
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
 define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1 addrspace(1)* %in
   %ext = sext i1 %load to i64
@@ -45,10 +45,10 @@ define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
   ret void
 }
 
-; SI-LABEL: @global_zextload_i1_to_i64
-; SI: BUFFER_LOAD_UBYTE
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}global_zextload_i1_to_i64:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
 define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1 addrspace(1)* %in
   %ext = zext i1 %load to i64
@@ -56,50 +56,50 @@ define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
   ret void
 }
 
-; SI-LABEL: @i1_arg
-; SI: BUFFER_LOAD_UBYTE
-; SI: V_AND_B32_e32
-; SI: BUFFER_STORE_BYTE
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}i1_arg:
+; SI: buffer_load_ubyte
+; SI: v_and_b32_e32
+; SI: buffer_store_byte
+; SI: s_endpgm
 define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
   store i1 %x, i1 addrspace(1)* %out, align 1
   ret void
 }
 
-; SI-LABEL: @i1_arg_zext_i32
-; SI: BUFFER_LOAD_UBYTE
-; SI: BUFFER_STORE_DWORD
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}i1_arg_zext_i32:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+; SI: s_endpgm
 define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
   %ext = zext i1 %x to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: @i1_arg_zext_i64
-; SI: BUFFER_LOAD_UBYTE
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}i1_arg_zext_i64:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
 define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
   %ext = zext i1 %x to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @i1_arg_sext_i32
+; SI-LABEL: {{^}}i1_arg_sext_i32:
 ; XSI: BUFFER_LOAD_BYTE
-; SI: BUFFER_STORE_DWORD
-; SI: S_ENDPGM
+; SI: buffer_store_dword
+; SI: s_endpgm
 define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
   %ext = sext i1 %x to i32
   store i32 %ext, i32addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: @i1_arg_sext_i64
+; SI-LABEL: {{^}}i1_arg_sext_i64:
 ; XSI: BUFFER_LOAD_BYTE
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
 define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
   %ext = sext i1 %x to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
diff --git a/test/CodeGen/R600/load-input-fold.ll b/test/CodeGen/R600/load-input-fold.ll
index ca86d0e..265fa9b 100644
--- a/test/CodeGen/R600/load-input-fold.ll
+++ b/test/CodeGen/R600/load-input-fold.ll
@@ -1,5 +1,4 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman
-;REQUIRES: asserts
 
 define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 {
 main_body:
diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
index a57df5c..62d3063 100644
--- a/test/CodeGen/R600/load.ll
+++ b/test/CodeGen/R600/load.ll
@@ -7,10 +7,10 @@
 ;===------------------------------------------------------------------------===;
 
 ; Load an i8 value from the global address space.
-; FUNC-LABEL: @load_i8
+; FUNC-LABEL: {{^}}load_i8:
 ; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
 
-; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
+; SI-CHECK: buffer_load_ubyte v{{[0-9]+}},
 define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %1 = load i8 addrspace(1)* %in
   %2 = zext i8 %1 to i32
@@ -18,13 +18,13 @@ define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
   ret void
 }
 
-; FUNC-LABEL: @load_i8_sext
+; FUNC-LABEL: {{^}}load_i8_sext:
 ; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
 ; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
 ; R600-CHECK: 24
 ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
 ; R600-CHECK: 24
-; SI-CHECK: BUFFER_LOAD_SBYTE
+; SI-CHECK: buffer_load_sbyte
 define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = load i8 addrspace(1)* %in
@@ -33,11 +33,11 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v2i8
+; FUNC-LABEL: {{^}}load_v2i8:
 ; R600-CHECK: VTX_READ_8
 ; R600-CHECK: VTX_READ_8
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
 define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
 entry:
   %0 = load <2 x i8> addrspace(1)* %in
@@ -46,7 +46,7 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v2i8_sext
+; FUNC-LABEL: {{^}}load_v2i8_sext:
 ; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
 ; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
 ; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
@@ -57,8 +57,8 @@ entry:
 ; R600-CHECK-DAG: 24
 ; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
 ; R600-CHECK-DAG: 24
-; SI-CHECK: BUFFER_LOAD_SBYTE
-; SI-CHECK: BUFFER_LOAD_SBYTE
+; SI-CHECK: buffer_load_sbyte
+; SI-CHECK: buffer_load_sbyte
 define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
 entry:
   %0 = load <2 x i8> addrspace(1)* %in
@@ -67,15 +67,15 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v4i8
+; FUNC-LABEL: {{^}}load_v4i8:
 ; R600-CHECK: VTX_READ_8
 ; R600-CHECK: VTX_READ_8
 ; R600-CHECK: VTX_READ_8
 ; R600-CHECK: VTX_READ_8
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
 define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
 entry:
   %0 = load <4 x i8> addrspace(1)* %in
@@ -84,7 +84,7 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v4i8_sext
+; FUNC-LABEL: {{^}}load_v4i8_sext:
 ; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
 ; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
 ; R600-CHECK-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
@@ -105,10 +105,10 @@ entry:
 ; R600-CHECK-DAG: 24
 ; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
 ; R600-CHECK-DAG: 24
-; SI-CHECK: BUFFER_LOAD_SBYTE
-; SI-CHECK: BUFFER_LOAD_SBYTE
-; SI-CHECK: BUFFER_LOAD_SBYTE
-; SI-CHECK: BUFFER_LOAD_SBYTE
+; SI-CHECK: buffer_load_sbyte
+; SI-CHECK: buffer_load_sbyte
+; SI-CHECK: buffer_load_sbyte
+; SI-CHECK: buffer_load_sbyte
 define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
 entry:
   %0 = load <4 x i8> addrspace(1)* %in
@@ -118,9 +118,9 @@ entry:
 }
 
 ; Load an i16 value from the global address space.
-; FUNC-LABEL: @load_i16
+; FUNC-LABEL: {{^}}load_i16:
 ; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: buffer_load_ushort
 define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
 entry:
   %0 = load i16	 addrspace(1)* %in
@@ -129,13 +129,13 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_i16_sext
+; FUNC-LABEL: {{^}}load_i16_sext:
 ; R600-CHECK: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
 ; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
 ; R600-CHECK: 16
 ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
 ; R600-CHECK: 16
-; SI-CHECK: BUFFER_LOAD_SSHORT
+; SI-CHECK: buffer_load_sshort
 define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
 entry:
   %0 = load i16 addrspace(1)* %in
@@ -144,11 +144,11 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v2i16
+; FUNC-LABEL: {{^}}load_v2i16:
 ; R600-CHECK: VTX_READ_16
 ; R600-CHECK: VTX_READ_16
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
 define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
 entry:
   %0 = load <2 x i16> addrspace(1)* %in
@@ -157,7 +157,7 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v2i16_sext
+; FUNC-LABEL: {{^}}load_v2i16_sext:
 ; R600-CHECK-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
 ; R600-CHECK-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
 ; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
@@ -168,8 +168,8 @@ entry:
 ; R600-CHECK-DAG: 16
 ; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
 ; R600-CHECK-DAG: 16
-; SI-CHECK: BUFFER_LOAD_SSHORT
-; SI-CHECK: BUFFER_LOAD_SSHORT
+; SI-CHECK: buffer_load_sshort
+; SI-CHECK: buffer_load_sshort
 define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
 entry:
   %0 = load <2 x i16> addrspace(1)* %in
@@ -178,15 +178,15 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v4i16
+; FUNC-LABEL: {{^}}load_v4i16:
 ; R600-CHECK: VTX_READ_16
 ; R600-CHECK: VTX_READ_16
 ; R600-CHECK: VTX_READ_16
 ; R600-CHECK: VTX_READ_16
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
 define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
 entry:
   %0 = load <4 x i16> addrspace(1)* %in
@@ -195,7 +195,7 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v4i16_sext
+; FUNC-LABEL: {{^}}load_v4i16_sext:
 ; R600-CHECK-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
 ; R600-CHECK-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
 ; R600-CHECK-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
@@ -216,10 +216,10 @@ entry:
 ; R600-CHECK-DAG: 16
 ; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
 ; R600-CHECK-DAG: 16
-; SI-CHECK: BUFFER_LOAD_SSHORT
-; SI-CHECK: BUFFER_LOAD_SSHORT
-; SI-CHECK: BUFFER_LOAD_SSHORT
-; SI-CHECK: BUFFER_LOAD_SSHORT
+; SI-CHECK: buffer_load_sshort
+; SI-CHECK: buffer_load_sshort
+; SI-CHECK: buffer_load_sshort
+; SI-CHECK: buffer_load_sshort
 define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
 entry:
   %0 = load <4 x i16> addrspace(1)* %in
@@ -229,10 +229,10 @@ entry:
 }
 
 ; load an i32 value from the global address space.
-; FUNC-LABEL: @load_i32
+; FUNC-LABEL: {{^}}load_i32:
 ; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 
-; SI-CHECK: BUFFER_LOAD_DWORD v{{[0-9]+}}
+; SI-CHECK: buffer_load_dword v{{[0-9]+}}
 define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = load i32 addrspace(1)* %in
@@ -241,10 +241,10 @@ entry:
 }
 
 ; load a f32 value from the global address space.
-; FUNC-LABEL: @load_f32
+; FUNC-LABEL: {{^}}load_f32:
 ; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 
-; SI-CHECK: BUFFER_LOAD_DWORD v{{[0-9]+}}
+; SI-CHECK: buffer_load_dword v{{[0-9]+}}
 define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
   %0 = load float addrspace(1)* %in
@@ -253,10 +253,10 @@ entry:
 }
 
 ; load a v2f32 value from the global address space
-; FUNC-LABEL: @load_v2f32
+; FUNC-LABEL: {{^}}load_v2f32:
+; R600-CHECK: MEM_RAT
 ; R600-CHECK: VTX_READ_64
-
-; SI-CHECK: BUFFER_LOAD_DWORDX2
+; SI-CHECK: buffer_load_dwordx2
 define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
 entry:
   %0 = load <2 x float> addrspace(1)* %in
@@ -264,11 +264,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_i64
-; R600-CHECK: MEM_RAT
-; R600-CHECK: MEM_RAT
-
-; SI-CHECK: BUFFER_LOAD_DWORDX2
+; FUNC-LABEL: {{^}}load_i64:
+; R600-CHECK: VTX_READ_64
+; SI-CHECK: buffer_load_dwordx2
 define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 entry:
   %0 = load i64 addrspace(1)* %in
@@ -276,12 +274,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_i64_sext
+; FUNC-LABEL: {{^}}load_i64_sext:
 ; R600-CHECK: MEM_RAT
 ; R600-CHECK: MEM_RAT
 ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}},  literal.x
 ; R600-CHECK: 31
-; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: buffer_load_dword
 
 define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
@@ -291,7 +289,7 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_i64_zext
+; FUNC-LABEL: {{^}}load_i64_zext:
 ; R600-CHECK: MEM_RAT
 ; R600-CHECK: MEM_RAT
 define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -302,18 +300,18 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v8i32
+; FUNC-LABEL: {{^}}load_v8i32:
 ; R600-CHECK: VTX_READ_128
 ; R600-CHECK: VTX_READ_128
 ; XXX: We should be using DWORDX4 instructions on SI.
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
 define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) {
 entry:
   %0 = load <8 x i32> addrspace(1)* %in
@@ -321,28 +319,28 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v16i32
+; FUNC-LABEL: {{^}}load_v16i32:
 ; R600-CHECK: VTX_READ_128
 ; R600-CHECK: VTX_READ_128
 ; R600-CHECK: VTX_READ_128
 ; R600-CHECK: VTX_READ_128
 ; XXX: We should be using DWORDX4 instructions on SI.
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
 define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) {
 entry:
   %0 = load <16 x i32> addrspace(1)* %in
@@ -355,13 +353,13 @@ entry:
 ;===------------------------------------------------------------------------===;
 
 ; Load a sign-extended i8 value
-; FUNC-LABEL: @load_const_i8_sext
+; FUNC-LABEL: {{^}}load_const_i8_sext:
 ; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
 ; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
 ; R600-CHECK: 24
 ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
 ; R600-CHECK: 24
-; SI-CHECK: BUFFER_LOAD_SBYTE v{{[0-9]+}},
+; SI-CHECK: buffer_load_sbyte v{{[0-9]+}},
 define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
   %0 = load i8 addrspace(2)* %in
@@ -371,9 +369,9 @@ entry:
 }
 
 ; Load an aligned i8 value
-; FUNC-LABEL: @load_const_i8_aligned
+; FUNC-LABEL: {{^}}load_const_i8_aligned:
 ; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
+; SI-CHECK: buffer_load_ubyte v{{[0-9]+}},
 define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
   %0 = load i8 addrspace(2)* %in
@@ -383,9 +381,9 @@ entry:
 }
 
 ; Load an un-aligned i8 value
-; FUNC-LABEL: @load_const_i8_unaligned
+; FUNC-LABEL: {{^}}load_const_i8_unaligned:
 ; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
+; SI-CHECK: buffer_load_ubyte v{{[0-9]+}},
 define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
   %0 = getelementptr i8 addrspace(2)* %in, i32 1
@@ -396,13 +394,13 @@ entry:
 }
 
 ; Load a sign-extended i16 value
-; FUNC-LABEL: @load_const_i16_sext
+; FUNC-LABEL: {{^}}load_const_i16_sext:
 ; R600-CHECK: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
 ; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
 ; R600-CHECK: 16
 ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
 ; R600-CHECK: 16
-; SI-CHECK: BUFFER_LOAD_SSHORT
+; SI-CHECK: buffer_load_sshort
 define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
   %0 = load i16 addrspace(2)* %in
@@ -412,9 +410,9 @@ entry:
 }
 
 ; Load an aligned i16 value
-; FUNC-LABEL: @load_const_i16_aligned
+; FUNC-LABEL: {{^}}load_const_i16_aligned:
 ; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: buffer_load_ushort
 define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
   %0 = load i16 addrspace(2)* %in
@@ -424,9 +422,9 @@ entry:
 }
 
 ; Load an un-aligned i16 value
-; FUNC-LABEL: @load_const_i16_unaligned
+; FUNC-LABEL: {{^}}load_const_i16_unaligned:
 ; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: buffer_load_ushort
 define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
   %0 = getelementptr i16 addrspace(2)* %in, i32 1
@@ -437,10 +435,10 @@ entry:
 }
 
 ; Load an i32 value from the constant address space.
-; FUNC-LABEL: @load_const_addrspace_i32
+; FUNC-LABEL: {{^}}load_const_addrspace_i32:
 ; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 
-; SI-CHECK: S_LOAD_DWORD s{{[0-9]+}}
+; SI-CHECK: s_load_dword s{{[0-9]+}}
 define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %0 = load i32 addrspace(2)* %in
@@ -449,10 +447,10 @@ entry:
 }
 
 ; Load a f32 value from the constant address space.
-; FUNC-LABEL: @load_const_addrspace_f32
+; FUNC-LABEL: {{^}}load_const_addrspace_f32:
 ; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 
-; SI-CHECK: S_LOAD_DWORD s{{[0-9]+}}
+; SI-CHECK: s_load_dword s{{[0-9]+}}
 define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) {
   %1 = load float addrspace(2)* %in
   store float %1, float addrspace(1)* %out
@@ -464,11 +462,11 @@ define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(
 ;===------------------------------------------------------------------------===;
 
 ; Load an i8 value from the local address space.
-; FUNC-LABEL: @load_i8_local
+; FUNC-LABEL: {{^}}load_i8_local:
 ; R600-CHECK: LDS_UBYTE_READ_RET
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_U8
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_u8
 define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
   %1 = load i8 addrspace(3)* %in
   %2 = zext i8 %1 to i32
@@ -476,12 +474,12 @@ define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
   ret void
 }
 
-; FUNC-LABEL: @load_i8_sext_local
+; FUNC-LABEL: {{^}}load_i8_sext_local:
 ; R600-CHECK: LDS_UBYTE_READ_RET
 ; R600-CHECK: ASHR
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_I8
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_i8
 define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
 entry:
   %0 = load i8 addrspace(3)* %in
@@ -490,13 +488,13 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v2i8_local
+; FUNC-LABEL: {{^}}load_v2i8_local:
 ; R600-CHECK: LDS_UBYTE_READ_RET
 ; R600-CHECK: LDS_UBYTE_READ_RET
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_U8
-; SI-CHECK: DS_READ_U8
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_u8
+; SI-CHECK: ds_read_u8
 define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
 entry:
   %0 = load <2 x i8> addrspace(3)* %in
@@ -505,15 +503,15 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v2i8_sext_local
+; FUNC-LABEL: {{^}}load_v2i8_sext_local:
 ; R600-CHECK-DAG: LDS_UBYTE_READ_RET
 ; R600-CHECK-DAG: LDS_UBYTE_READ_RET
 ; R600-CHECK-DAG: ASHR
 ; R600-CHECK-DAG: ASHR
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_I8
-; SI-CHECK: DS_READ_I8
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_i8
+; SI-CHECK: ds_read_i8
 define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
 entry:
   %0 = load <2 x i8> addrspace(3)* %in
@@ -522,17 +520,17 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v4i8_local
+; FUNC-LABEL: {{^}}load_v4i8_local:
 ; R600-CHECK: LDS_UBYTE_READ_RET
 ; R600-CHECK: LDS_UBYTE_READ_RET
 ; R600-CHECK: LDS_UBYTE_READ_RET
 ; R600-CHECK: LDS_UBYTE_READ_RET
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_U8
-; SI-CHECK: DS_READ_U8
-; SI-CHECK: DS_READ_U8
-; SI-CHECK: DS_READ_U8
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_u8
+; SI-CHECK: ds_read_u8
+; SI-CHECK: ds_read_u8
+; SI-CHECK: ds_read_u8
 define void @load_v4i8_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
 entry:
   %0 = load <4 x i8> addrspace(3)* %in
@@ -541,7 +539,7 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v4i8_sext_local
+; FUNC-LABEL: {{^}}load_v4i8_sext_local:
 ; R600-CHECK-DAG: LDS_UBYTE_READ_RET
 ; R600-CHECK-DAG: LDS_UBYTE_READ_RET
 ; R600-CHECK-DAG: LDS_UBYTE_READ_RET
@@ -550,12 +548,12 @@ entry:
 ; R600-CHECK-DAG: ASHR
 ; R600-CHECK-DAG: ASHR
 ; R600-CHECK-DAG: ASHR
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_I8
-; SI-CHECK: DS_READ_I8
-; SI-CHECK: DS_READ_I8
-; SI-CHECK: DS_READ_I8
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_i8
+; SI-CHECK: ds_read_i8
+; SI-CHECK: ds_read_i8
+; SI-CHECK: ds_read_i8
 define void @load_v4i8_sext_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
 entry:
   %0 = load <4 x i8> addrspace(3)* %in
@@ -565,11 +563,11 @@ entry:
 }
 
 ; Load an i16 value from the local address space.
-; FUNC-LABEL: @load_i16_local
+; FUNC-LABEL: {{^}}load_i16_local:
 ; R600-CHECK: LDS_USHORT_READ_RET
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_U16
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_u16
 define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
 entry:
   %0 = load i16	 addrspace(3)* %in
@@ -578,12 +576,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_i16_sext_local
+; FUNC-LABEL: {{^}}load_i16_sext_local:
 ; R600-CHECK: LDS_USHORT_READ_RET
 ; R600-CHECK: ASHR
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_I16
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_i16
 define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
 entry:
   %0 = load i16 addrspace(3)* %in
@@ -592,13 +590,13 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v2i16_local
+; FUNC-LABEL: {{^}}load_v2i16_local:
 ; R600-CHECK: LDS_USHORT_READ_RET
 ; R600-CHECK: LDS_USHORT_READ_RET
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_U16
-; SI-CHECK: DS_READ_U16
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_u16
+; SI-CHECK: ds_read_u16
 define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
 entry:
   %0 = load <2 x i16> addrspace(3)* %in
@@ -607,15 +605,15 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v2i16_sext_local
+; FUNC-LABEL: {{^}}load_v2i16_sext_local:
 ; R600-CHECK-DAG: LDS_USHORT_READ_RET
 ; R600-CHECK-DAG: LDS_USHORT_READ_RET
 ; R600-CHECK-DAG: ASHR
 ; R600-CHECK-DAG: ASHR
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_I16
-; SI-CHECK: DS_READ_I16
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_i16
+; SI-CHECK: ds_read_i16
 define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
 entry:
   %0 = load <2 x i16> addrspace(3)* %in
@@ -624,17 +622,17 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v4i16_local
+; FUNC-LABEL: {{^}}load_v4i16_local:
 ; R600-CHECK: LDS_USHORT_READ_RET
 ; R600-CHECK: LDS_USHORT_READ_RET
 ; R600-CHECK: LDS_USHORT_READ_RET
 ; R600-CHECK: LDS_USHORT_READ_RET
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_U16
-; SI-CHECK: DS_READ_U16
-; SI-CHECK: DS_READ_U16
-; SI-CHECK: DS_READ_U16
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_u16
+; SI-CHECK: ds_read_u16
+; SI-CHECK: ds_read_u16
+; SI-CHECK: ds_read_u16
 define void @load_v4i16_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
 entry:
   %0 = load <4 x i16> addrspace(3)* %in
@@ -643,7 +641,7 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @load_v4i16_sext_local
+; FUNC-LABEL: {{^}}load_v4i16_sext_local:
 ; R600-CHECK-DAG: LDS_USHORT_READ_RET
 ; R600-CHECK-DAG: LDS_USHORT_READ_RET
 ; R600-CHECK-DAG: LDS_USHORT_READ_RET
@@ -652,12 +650,12 @@ entry:
 ; R600-CHECK-DAG: ASHR
 ; R600-CHECK-DAG: ASHR
 ; R600-CHECK-DAG: ASHR
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_I16
-; SI-CHECK: DS_READ_I16
-; SI-CHECK: DS_READ_I16
-; SI-CHECK: DS_READ_I16
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_i16
+; SI-CHECK: ds_read_i16
+; SI-CHECK: ds_read_i16
+; SI-CHECK: ds_read_i16
 define void @load_v4i16_sext_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
 entry:
   %0 = load <4 x i16> addrspace(3)* %in
@@ -667,11 +665,11 @@ entry:
 }
 
 ; load an i32 value from the local address space.
-; FUNC-LABEL: @load_i32_local
+; FUNC-LABEL: {{^}}load_i32_local:
 ; R600-CHECK: LDS_READ_RET
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_B32
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_b32
 define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = load i32 addrspace(3)* %in
@@ -680,10 +678,10 @@ entry:
 }
 
 ; load a f32 value from the local address space.
-; FUNC-LABEL: @load_f32_local
+; FUNC-LABEL: {{^}}load_f32_local:
 ; R600-CHECK: LDS_READ_RET
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_B32
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_b32
 define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) {
 entry:
   %0 = load float addrspace(3)* %in
@@ -692,14 +690,50 @@ entry:
 }
 
 ; load a v2f32 value from the local address space
-; FUNC-LABEL: @load_v2f32_local
+; FUNC-LABEL: {{^}}load_v2f32_local:
 ; R600-CHECK: LDS_READ_RET
 ; R600-CHECK: LDS_READ_RET
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_B64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_b64
 define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) {
 entry:
   %0 = load <2 x float> addrspace(3)* %in
   store <2 x float> %0, <2 x float> addrspace(1)* %out
   ret void
 }
+
+; Test loading a i32 and v2i32 value from the same base pointer.
+; FUNC-LABEL: {{^}}load_i32_v2i32_local:
+; R600-CHECK: LDS_READ_RET
+; R600-CHECK: LDS_READ_RET
+; R600-CHECK: LDS_READ_RET
+; SI-CHECK-DAG: ds_read_b32
+; SI-CHECK-DAG: ds_read2_b32
+define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) {
+  %scalar = load i32 addrspace(3)* %in
+  %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
+  %vec_ptr = getelementptr <2 x i32> addrspace(3)* %tmp0, i32 2
+  %vec0 = load <2 x i32> addrspace(3)* %vec_ptr, align 4
+  %vec1 = insertelement <2 x i32> <i32 0, i32 0>, i32 %scalar, i32 0
+  %vec = add <2 x i32> %vec0, %vec1
+  store <2 x i32> %vec, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+
+@lds = addrspace(3) global [512 x i32] undef, align 4
+
+; On SI we need to make sure that the base offset is a register and not
+; an immediate.
+; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
+; SI-CHECK: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
+; SI-CHECK: ds_read_b32 v0, v[[ZERO]] offset:4
+; R600-CHECK: LDS_READ_RET
+define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+entry:
+  %tmp0 = getelementptr [512 x i32] addrspace(3)* @lds, i32 0, i32 1
+  %tmp1 = load i32 addrspace(3)* %tmp0
+  %tmp2 = getelementptr i32 addrspace(1)* %out, i32 1
+  store i32 %tmp1, i32 addrspace(1)* %tmp2
+  ret void
+}
diff --git a/test/CodeGen/R600/load.vec.ll b/test/CodeGen/R600/load.vec.ll
index 81a6310..0d6e213 100644
--- a/test/CodeGen/R600/load.vec.ll
+++ b/test/CodeGen/R600/load.vec.ll
@@ -2,10 +2,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK  %s
 
 ; load a v2i32 value from the global address space.
-; EG-CHECK: @load_v2i32
+; EG-CHECK: {{^}}load_v2i32:
 ; EG-CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0
-; SI-CHECK: @load_v2i32
-; SI-CHECK: BUFFER_LOAD_DWORDX2 v[{{[0-9]+:[0-9]+}}]
+; SI-CHECK: {{^}}load_v2i32:
+; SI-CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
 define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %a = load <2 x i32> addrspace(1) * %in
   store <2 x i32> %a, <2 x i32> addrspace(1)* %out
@@ -13,10 +13,10 @@ define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
 }
 
 ; load a v4i32 value from the global address space.
-; EG-CHECK: @load_v4i32
+; EG-CHECK: {{^}}load_v4i32:
 ; EG-CHECK: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0
-; SI-CHECK: @load_v4i32
-; SI-CHECK: BUFFER_LOAD_DWORDX4 v[{{[0-9]+:[0-9]+}}]
+; SI-CHECK: {{^}}load_v4i32:
+; SI-CHECK: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}]
 define void @load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %a = load <4 x i32> addrspace(1) * %in
   store <4 x i32> %a, <4 x i32> addrspace(1)* %out
diff --git a/test/CodeGen/R600/load64.ll b/test/CodeGen/R600/load64.ll
index a117557..a60c4eb 100644
--- a/test/CodeGen/R600/load64.ll
+++ b/test/CodeGen/R600/load64.ll
@@ -1,18 +1,18 @@
 ; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
 ; load a f64 value from the global address space.
-; CHECK-LABEL: @load_f64:
-; CHECK: BUFFER_LOAD_DWORDX2 v[{{[0-9]+:[0-9]+}}]
-; CHECK: BUFFER_STORE_DWORDX2 v[{{[0-9]+:[0-9]+}}]
+; CHECK-LABEL: {{^}}load_f64:
+; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
+; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
 define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
   %1 = load double addrspace(1)* %in
   store double %1, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: @load_i64:
-; CHECK: BUFFER_LOAD_DWORDX2 v[{{[0-9]+:[0-9]+}}]
-; CHECK: BUFFER_STORE_DWORDX2 v[{{[0-9]+:[0-9]+}}]
+; CHECK-LABEL: {{^}}load_i64:
+; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
+; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
 define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tmp = load i64 addrspace(1)* %in
   store i64 %tmp, i64 addrspace(1)* %out, align 8
@@ -20,9 +20,9 @@ define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 }
 
 ; Load a f64 value from the constant address space.
-; CHECK-LABEL: @load_const_addrspace_f64:
-; CHECK: S_LOAD_DWORDX2 s[{{[0-9]+:[0-9]+}}]
-; CHECK: BUFFER_STORE_DWORDX2 v[{{[0-9]+:[0-9]+}}]
+; CHECK-LABEL: {{^}}load_const_addrspace_f64:
+; CHECK: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
+; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
 define void @load_const_addrspace_f64(double addrspace(1)* %out, double addrspace(2)* %in) {
   %1 = load double addrspace(2)* %in
   store double %1, double addrspace(1)* %out
diff --git a/test/CodeGen/R600/local-64.ll b/test/CodeGen/R600/local-64.ll
index c52b41b..eb14b5f 100644
--- a/test/CodeGen/R600/local-64.ll
+++ b/test/CodeGen/R600/local-64.ll
@@ -1,8 +1,9 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s
 
-; SI-LABEL: @local_i32_load
-; SI: DS_READ_B32 [[REG:v[0-9]+]], v{{[0-9]+}}, 0x1c, [M0]
-; SI: BUFFER_STORE_DWORD [[REG]],
+; BOTH-LABEL: {{^}}local_i32_load
+; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 [M0]
+; BOTH: buffer_store_dword [[REG]],
 define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
   %gep = getelementptr i32 addrspace(3)* %in, i32 7
   %val = load i32 addrspace(3)* %gep, align 4
@@ -10,19 +11,19 @@ define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounw
   ret void
 }
 
-; SI-LABEL: @local_i32_load_0_offset
-; SI: DS_READ_B32 [[REG:v[0-9]+]], v{{[0-9]+}}, 0x0, [M0]
-; SI: BUFFER_STORE_DWORD [[REG]],
+; BOTH-LABEL: {{^}}local_i32_load_0_offset
+; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} [M0]
+; BOTH: buffer_store_dword [[REG]],
 define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
   %val = load i32 addrspace(3)* %in, align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: @local_i8_load_i16_max_offset
-; SI-NOT: ADD
-; SI: DS_READ_U8 [[REG:v[0-9]+]], {{v[0-9]+}}, 0xffff, [M0]
-; SI: BUFFER_STORE_BYTE [[REG]],
+; BOTH-LABEL: {{^}}local_i8_load_i16_max_offset:
+; BOTH-NOT: ADD
+; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 [M0]
+; BOTH: buffer_store_byte [[REG]],
 define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
   %gep = getelementptr i8 addrspace(3)* %in, i32 65535
   %val = load i8 addrspace(3)* %gep, align 4
@@ -30,11 +31,14 @@ define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)
   ret void
 }
 
-; SI-LABEL: @local_i8_load_over_i16_max_offset
-; SI: S_ADD_I32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
-; SI: V_MOV_B32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
-; SI: DS_READ_U8 [[REG:v[0-9]+]], [[VREGADDR]], 0x0, [M0]
-; SI: BUFFER_STORE_BYTE [[REG]],
+; BOTH-LABEL: {{^}}local_i8_load_over_i16_max_offset:
+; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
+; SI, which is why it is being OR'd with the base pointer.
+; SI: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
+; CI: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
+; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
+; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] [M0]
+; BOTH: buffer_store_byte [[REG]],
 define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
   %gep = getelementptr i8 addrspace(3)* %in, i32 65536
   %val = load i8 addrspace(3)* %gep, align 4
@@ -42,10 +46,10 @@ define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspa
   ret void
 }
 
-; SI-LABEL: @local_i64_load
-; SI-NOT: ADD
-; SI: DS_READ_B64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}}, 0x38, [M0]
-; SI: BUFFER_STORE_DWORDX2 [[REG]],
+; BOTH-LABEL: {{^}}local_i64_load:
+; BOTH-NOT: ADD
+; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 [M0]
+; BOTH: buffer_store_dwordx2 [[REG]],
 define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
   %gep = getelementptr i64 addrspace(3)* %in, i32 7
   %val = load i64 addrspace(3)* %gep, align 8
@@ -53,19 +57,19 @@ define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounw
   ret void
 }
 
-; SI-LABEL: @local_i64_load_0_offset
-; SI: DS_READ_B64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0x0, [M0]
-; SI: BUFFER_STORE_DWORDX2 [[REG]],
+; BOTH-LABEL: {{^}}local_i64_load_0_offset
+; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} [M0]
+; BOTH: buffer_store_dwordx2 [[REG]],
 define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
   %val = load i64 addrspace(3)* %in, align 8
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @local_f64_load
-; SI-NOT: ADD
-; SI: DS_READ_B64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}}, 0x38, [M0]
-; SI: BUFFER_STORE_DWORDX2 [[REG]],
+; BOTH-LABEL: {{^}}local_f64_load:
+; BOTH-NOT: ADD
+; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 [M0]
+; BOTH: buffer_store_dwordx2 [[REG]],
 define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
   %gep = getelementptr double addrspace(3)* %in, i32 7
   %val = load double addrspace(3)* %gep, align 8
@@ -73,85 +77,89 @@ define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in)
   ret void
 }
 
-; SI-LABEL: @local_f64_load_0_offset
-; SI: DS_READ_B64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0x0, [M0]
-; SI: BUFFER_STORE_DWORDX2 [[REG]],
+; BOTH-LABEL: {{^}}local_f64_load_0_offset
+; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} [M0]
+; BOTH: buffer_store_dwordx2 [[REG]],
 define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
   %val = load double addrspace(3)* %in, align 8
   store double %val, double addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @local_i64_store
-; SI-NOT: ADD
-; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x38 [M0]
+; BOTH-LABEL: {{^}}local_i64_store:
+; BOTH-NOT: ADD
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 [M0]
 define void @local_i64_store(i64 addrspace(3)* %out) nounwind {
   %gep = getelementptr i64 addrspace(3)* %out, i32 7
   store i64 5678, i64 addrspace(3)* %gep, align 8
   ret void
 }
 
-; SI-LABEL: @local_i64_store_0_offset
-; SI-NOT: ADD
-; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x0 [M0]
+; BOTH-LABEL: {{^}}local_i64_store_0_offset:
+; BOTH-NOT: ADD
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} [M0]
 define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
   store i64 1234, i64 addrspace(3)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @local_f64_store
-; SI-NOT: ADD
-; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x38 [M0]
+; BOTH-LABEL: {{^}}local_f64_store:
+; BOTH-NOT: ADD
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 [M0]
 define void @local_f64_store(double addrspace(3)* %out) nounwind {
   %gep = getelementptr double addrspace(3)* %out, i32 7
   store double 16.0, double addrspace(3)* %gep, align 8
   ret void
 }
 
-; SI-LABEL: @local_f64_store_0_offset
-; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x0 [M0]
+; BOTH-LABEL: {{^}}local_f64_store_0_offset
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} [M0]
 define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
   store double 20.0, double addrspace(3)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @local_v2i64_store
-; SI-NOT: ADD
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x78 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x70 [M0]
+; BOTH-LABEL: {{^}}local_v2i64_store:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120 [M0]
+; BOTH: s_endpgm
 define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
   %gep = getelementptr <2 x i64> addrspace(3)* %out, i32 7
   store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
   ret void
 }
 
-; SI-LABEL: @local_v2i64_store_0_offset
-; SI-NOT: ADD
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x8 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x0 [M0]
+; BOTH-LABEL: {{^}}local_v2i64_store_0_offset:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8 [M0]
+; BOTH: s_endpgm
 define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
   store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @local_v4i64_store
-; SI-NOT: ADD
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0xf8 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0xf0 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0xe8 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0xe0 [M0]
+; BOTH-LABEL: {{^}}local_v4i64_store:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248 [M0]
+; BOTH: s_endpgm
 define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
   %gep = getelementptr <4 x i64> addrspace(3)* %out, i32 7
   store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
   ret void
 }
 
-; SI-LABEL: @local_v4i64_store_0_offset
-; SI-NOT: ADD
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x18 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x10 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x8 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x0 [M0]
+; BOTH-LABEL: {{^}}local_v4i64_store_0_offset:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24 [M0]
+; BOTH: s_endpgm
 define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
   store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
   ret void
diff --git a/test/CodeGen/R600/local-atomics.ll b/test/CodeGen/R600/local-atomics.ll
index 5a44951..2ac811f 100644
--- a/test/CodeGen/R600/local-atomics.ll
+++ b/test/CodeGen/R600/local-atomics.ll
@@ -1,21 +1,25 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-; FUNC-LABEL: @lds_atomic_xchg_ret_i32:
-; SI: S_LOAD_DWORD [[SPTR:s[0-9]+]],
-; SI: V_MOV_B32_e32 [[DATA:v[0-9]+]], 4
-; SI: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; SI: DS_WRXCHG_RTN_B32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]], 0x0, [M0]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
+; EG: LDS_WRXCHG_RET *
+; SI: s_load_dword [[SPTR:s[0-9]+]],
+; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] [M0]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_xchg_ret_i32_offset:
-; SI: DS_WRXCHG_RTN_B32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32_offset:
+; EG: LDS_WRXCHG_RET *
+; SI: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -24,22 +28,24 @@ define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
 }
 
 ; XXX - Is it really necessary to load 4 into VGPR?
-; FUNC-LABEL: @lds_atomic_add_ret_i32:
-; SI: S_LOAD_DWORD [[SPTR:s[0-9]+]],
-; SI: V_MOV_B32_e32 [[DATA:v[0-9]+]], 4
-; SI: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; SI: DS_ADD_RTN_U32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]], 0x0, [M0]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
+; EG: LDS_ADD_RET *
+; SI: s_load_dword [[SPTR:s[0-9]+]],
+; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] [M0]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_add_ret_i32_offset:
-; SI: DS_ADD_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_offset:
+; EG: LDS_ADD_RET *
+; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -47,22 +53,38 @@ define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_inc_ret_i32:
-; SI: S_MOV_B32 [[SNEGONE:s[0-9]+]], -1
-; SI: V_MOV_B32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
-; SI: DS_INC_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]], 0x0
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_bad_si_offset:
+; EG: LDS_ADD_RET *
+; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} [M0]
+; CI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+  %sub = sub i32 %a, %b
+  %add = add i32 %sub, 4
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 %add
+  %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32:
+; EG: LDS_ADD_RET *
+; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] [M0]
+; SI: s_endpgm
 define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_inc_ret_i32_offset:
-; SI: S_MOV_B32 [[SNEGONE:s[0-9]+]], -1
-; SI: V_MOV_B32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
-; SI: DS_INC_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]], 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
+; EG: LDS_ADD_RET *
+; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
+; SI: s_endpgm
 define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
@@ -70,18 +92,34 @@ define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_sub_ret_i32:
-; SI: DS_SUB_RTN_U32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_bad_si_offset:
+; EG: LDS_ADD_RET *
+; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} [M0]
+; CI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_inc_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+  %sub = sub i32 %a, %b
+  %add = add i32 %sub, 4
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 %add
+  %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32:
+; EG: LDS_SUB_RET *
+; SI: ds_sub_rtn_u32
+; SI: s_endpgm
 define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_sub_ret_i32_offset:
-; SI: DS_SUB_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32_offset:
+; EG: LDS_SUB_RET *
+; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -89,22 +127,24 @@ define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_dec_ret_i32:
-; SI: S_MOV_B32 [[SNEGONE:s[0-9]+]], -1
-; SI: V_MOV_B32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
-; SI: DS_DEC_RTN_U32  v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]], 0x0
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32:
+; EG: LDS_SUB_RET *
+; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: ds_dec_rtn_u32  v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] [M0]
+; SI: s_endpgm
 define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_dec_ret_i32_offset:
-; SI: S_MOV_B32 [[SNEGONE:s[0-9]+]], -1
-; SI: V_MOV_B32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
-; SI: DS_DEC_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]], 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
+; EG: LDS_SUB_RET *
+; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
+; SI: s_endpgm
 define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
@@ -112,18 +152,20 @@ define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_and_ret_i32:
-; SI: DS_AND_RTN_B32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32:
+; EG: LDS_AND_RET *
+; SI: ds_and_rtn_b32
+; SI: s_endpgm
 define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_and_ret_i32_offset:
-; SI: DS_AND_RTN_B32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32_offset:
+; EG: LDS_AND_RET *
+; SI: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -131,18 +173,20 @@ define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_or_ret_i32:
-; SI: DS_OR_RTN_B32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32:
+; EG: LDS_OR_RET *
+; SI: ds_or_rtn_b32
+; SI: s_endpgm
 define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_or_ret_i32_offset:
-; SI: DS_OR_RTN_B32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32_offset:
+; EG: LDS_OR_RET *
+; SI: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -150,18 +194,20 @@ define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_xor_ret_i32:
-; SI: DS_XOR_RTN_B32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32:
+; EG: LDS_XOR_RET *
+; SI: ds_xor_rtn_b32
+; SI: s_endpgm
 define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_xor_ret_i32_offset:
-; SI: DS_XOR_RTN_B32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32_offset:
+; EG: LDS_XOR_RET *
+; SI: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -170,25 +216,27 @@ define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 }
 
 ; FIXME: There is no atomic nand instr
-; XFUNC-LABEL: @lds_atomic_nand_ret_i32:uction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i32:uction, so we somehow need to expand this.
 ; define void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
 ;   %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
 ;   store i32 %result, i32 addrspace(1)* %out, align 4
 ;   ret void
 ; }
 
-; FUNC-LABEL: @lds_atomic_min_ret_i32:
-; SI: DS_MIN_RTN_I32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32:
+; EG: LDS_MIN_INT_RET *
+; SI: ds_min_rtn_i32
+; SI: s_endpgm
 define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_min_ret_i32_offset:
-; SI: DS_MIN_RTN_I32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32_offset:
+; EG: LDS_MIN_INT_RET *
+; SI: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -196,18 +244,20 @@ define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_max_ret_i32:
-; SI: DS_MAX_RTN_I32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32:
+; EG: LDS_MAX_INT_RET *
+; SI: ds_max_rtn_i32
+; SI: s_endpgm
 define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_max_ret_i32_offset:
-; SI: DS_MAX_RTN_I32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32_offset:
+; EG: LDS_MAX_INT_RET *
+; SI: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -215,18 +265,20 @@ define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_umin_ret_i32:
-; SI: DS_MIN_RTN_U32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32:
+; EG: LDS_MIN_UINT_RET *
+; SI: ds_min_rtn_u32
+; SI: s_endpgm
 define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_umin_ret_i32_offset:
-; SI: DS_MIN_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32_offset:
+; EG: LDS_MIN_UINT_RET *
+; SI: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -234,21 +286,273 @@ define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_umax_ret_i32:
-; SI: DS_MAX_RTN_U32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32:
+; EG: LDS_MAX_UINT_RET *
+; SI: ds_max_rtn_u32
+; SI: s_endpgm
 define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_umax_ret_i32_offset:
-; SI: DS_MAX_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32_offset:
+; EG: LDS_MAX_UINT_RET *
+; SI: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32:
+; SI: s_load_dword [[SPTR:s[0-9]+]],
+; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] [M0]
+; SI: s_endpgm
+define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset:
+; SI: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; XXX - Is it really necessary to load 4 into VGPR?
+; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32:
+; SI: s_load_dword [[SPTR:s[0-9]+]],
+; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_add_u32 [[VPTR]], [[DATA]] [M0]
+; SI: s_endpgm
+define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset:
+; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_bad_si_offset
+; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} [M0]
+; CI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 [M0]
+; SI: s_endpgm
+define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+  %sub = sub i32 %a, %b
+  %add = add i32 %sub, 4
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 %add
+  %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32:
+; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] [M0]
+; SI: s_endpgm
+define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
+; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; SI: s_endpgm
+define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_bad_si_offset:
+; SI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}}
+; CI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_inc_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+  %sub = sub i32 %a, %b
+  %add = add i32 %sub, 4
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 %add
+  %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32:
+; SI: ds_sub_u32
+; SI: s_endpgm
+define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset:
+; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32:
+; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: ds_dec_u32  v{{[0-9]+}}, [[NEGONE]]
+; SI: s_endpgm
+define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
+; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; SI: s_endpgm
+define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32:
+; SI: ds_and_b32
+; SI: s_endpgm
+define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset:
+; SI: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32:
+; SI: ds_or_b32
+; SI: s_endpgm
+define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset:
+; SI: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32:
+; SI: ds_xor_b32
+; SI: s_endpgm
+define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset:
+; SI: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FIXME: There is no atomic nand instr
+; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i32:uction, so we somehow need to expand this.
+; define void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+;   %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
+;   ret void
+; }
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32:
+; SI: ds_min_i32
+; SI: s_endpgm
+define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset:
+; SI: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32:
+; SI: ds_max_i32
+; SI: s_endpgm
+define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset:
+; SI: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32:
+; SI: ds_min_u32
+; SI: s_endpgm
+define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset:
+; SI: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32:
+; SI: ds_max_u32
+; SI: s_endpgm
+define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset:
+; SI: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
diff --git a/test/CodeGen/R600/local-atomics64.ll b/test/CodeGen/R600/local-atomics64.ll
index 849b033..ce0cf59 100644
--- a/test/CodeGen/R600/local-atomics64.ll
+++ b/test/CodeGen/R600/local-atomics64.ll
@@ -1,17 +1,17 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI %s
 
-; FUNC-LABEL: @lds_atomic_xchg_ret_i64:
-; SI: DS_WRXCHG_RTN_B64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64:
+; SI: ds_wrxchg_rtn_b64
+; SI: s_endpgm
 define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_xchg_ret_i64_offset:
-; SI: DS_WRXCHG_RTN_B64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset:
+; SI: ds_wrxchg_rtn_b64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -19,24 +19,24 @@ define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_add_ret_i64:
-; SI: DS_ADD_RTN_U64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64:
+; SI: ds_add_rtn_u64
+; SI: s_endpgm
 define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_add_ret_i64_offset:
-; SI: S_LOAD_DWORD [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: S_MOV_B64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, 9
-; SI-DAG: V_MOV_B32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
-; SI-DAG: V_MOV_B32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
-; SI-DAG: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI: DS_ADD_RTN_U64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}, 0x20, [M0]
-; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, 9
+; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
+; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; SI: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 [M0]
+; SI: buffer_store_dwordx2 [[RESULT]],
+; SI: s_endpgm
 define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i64 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
@@ -44,22 +44,22 @@ define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_inc_ret_i64:
-; SI: S_MOV_B64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
-; SI-DAG: V_MOV_B32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
-; SI-DAG: V_MOV_B32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
-; SI: DS_INC_RTN_U64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}},
-; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64:
+; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
+; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
+; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT]],
+; SI: s_endpgm
 define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_inc_ret_i64_offset:
-; SI: DS_INC_RTN_U64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64_offset:
+; SI: ds_inc_rtn_u64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
@@ -67,18 +67,18 @@ define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_sub_ret_i64:
-; SI: DS_SUB_RTN_U64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64:
+; SI: ds_sub_rtn_u64
+; SI: s_endpgm
 define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_sub_ret_i64_offset:
-; SI: DS_SUB_RTN_U64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64_offset:
+; SI: ds_sub_rtn_u64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -86,22 +86,22 @@ define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_dec_ret_i64:
-; SI: S_MOV_B64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
-; SI-DAG: V_MOV_B32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
-; SI-DAG: V_MOV_B32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
-; SI: DS_DEC_RTN_U64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}},
-; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64:
+; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
+; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
+; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT]],
+; SI: s_endpgm
 define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_dec_ret_i64_offset:
-; SI: DS_DEC_RTN_U64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64_offset:
+; SI: ds_dec_rtn_u64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
@@ -109,18 +109,18 @@ define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_and_ret_i64:
-; SI: DS_AND_RTN_B64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64:
+; SI: ds_and_rtn_b64
+; SI: s_endpgm
 define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_and_ret_i64_offset:
-; SI: DS_AND_RTN_B64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
+; SI: ds_and_rtn_b64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -128,18 +128,18 @@ define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_or_ret_i64:
-; SI: DS_OR_RTN_B64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64:
+; SI: ds_or_rtn_b64
+; SI: s_endpgm
 define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_or_ret_i64_offset:
-; SI: DS_OR_RTN_B64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
+; SI: ds_or_rtn_b64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -147,18 +147,18 @@ define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_xor_ret_i64:
-; SI: DS_XOR_RTN_B64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64:
+; SI: ds_xor_rtn_b64
+; SI: s_endpgm
 define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_xor_ret_i64_offset:
-; SI: DS_XOR_RTN_B64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
+; SI: ds_xor_rtn_b64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -167,25 +167,25 @@ define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 }
 
 ; FIXME: There is no atomic nand instr
-; XFUNC-LABEL: @lds_atomic_nand_ret_i64:uction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i64:uction, so we somehow need to expand this.
 ; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
 ;   %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst
 ;   store i64 %result, i64 addrspace(1)* %out, align 8
 ;   ret void
 ; }
 
-; FUNC-LABEL: @lds_atomic_min_ret_i64:
-; SI: DS_MIN_RTN_I64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64:
+; SI: ds_min_rtn_i64
+; SI: s_endpgm
 define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_min_ret_i64_offset:
-; SI: DS_MIN_RTN_I64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64_offset:
+; SI: ds_min_rtn_i64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -193,18 +193,18 @@ define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_max_ret_i64:
-; SI: DS_MAX_RTN_I64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64:
+; SI: ds_max_rtn_i64
+; SI: s_endpgm
 define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_max_ret_i64_offset:
-; SI: DS_MAX_RTN_I64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64_offset:
+; SI: ds_max_rtn_i64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -212,18 +212,18 @@ define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_umin_ret_i64:
-; SI: DS_MIN_RTN_U64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64:
+; SI: ds_min_rtn_u64
+; SI: s_endpgm
 define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_umin_ret_i64_offset:
-; SI: DS_MIN_RTN_U64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64_offset:
+; SI: ds_min_rtn_u64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -231,21 +231,243 @@ define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_umax_ret_i64:
-; SI: DS_MAX_RTN_U64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64:
+; SI: ds_max_rtn_u64
+; SI: s_endpgm
 define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_umax_ret_i64_offset:
-; SI: DS_MAX_RTN_U64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64_offset:
+; SI: ds_max_rtn_u64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64:
+; SI: ds_wrxchg_rtn_b64
+; SI: s_endpgm
+define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset:
+; SI: ds_wrxchg_rtn_b64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64:
+; SI: ds_add_u64
+; SI: s_endpgm
+define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64_offset:
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, 9
+; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
+; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; SI: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 [M0]
+; SI: s_endpgm
+define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i64 4
+  %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64:
+; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
+; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
+; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; SI: s_endpgm
+define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64_offset:
+; SI: ds_inc_u64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64:
+; SI: ds_sub_u64
+; SI: s_endpgm
+define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64_offset:
+; SI: ds_sub_u64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64:
+; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
+; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
+; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; SI: s_endpgm
+define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64_offset:
+; SI: ds_dec_u64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64:
+; SI: ds_and_b64
+; SI: s_endpgm
+define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64_offset:
+; SI: ds_and_b64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64:
+; SI: ds_or_b64
+; SI: s_endpgm
+define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
+; SI: ds_or_b64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64:
+; SI: ds_xor_b64
+; SI: s_endpgm
+define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
+; SI: ds_xor_b64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FIXME: There is no atomic nand instr
+; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i64:uction, so we somehow need to expand this.
+; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+;   %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst
+;   ret void
+; }
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64:
+; SI: ds_min_i64
+; SI: s_endpgm
+define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
+; SI: ds_min_i64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64:
+; SI: ds_max_i64
+; SI: s_endpgm
+define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
+; SI: ds_max_i64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64:
+; SI: ds_min_u64
+; SI: s_endpgm
+define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
+; SI: ds_min_u64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64:
+; SI: ds_max_u64
+; SI: s_endpgm
+define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
+; SI: ds_max_u64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll
index e29e4cc..88ef05d 100644
--- a/test/CodeGen/R600/local-memory-two-objects.ll
+++ b/test/CodeGen/R600/local-memory-two-objects.ll
@@ -1,10 +1,11 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=SI %s
+; RUN: llc < %s -march=r600 -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=CI %s
 
-@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] zeroinitializer, align 4
-@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] zeroinitializer, align 4
+@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
 
-; EG-CHECK: @local_memory_two_objects
+; EG-CHECK: {{^}}local_memory_two_objects:
 
 ; Check that the LDS size emitted correctly
 ; EG-CHECK: .long 166120
@@ -17,8 +18,8 @@
 ; this consistently on evergreen GPUs.
 ; EG-CHECK: LDS_WRITE
 ; EG-CHECK: LDS_WRITE
-; SI-CHECK: DS_WRITE_B32 {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
-; SI-CHECK-NOT: DS_WRITE_B32 {{v[0-9]*}}, v[[ADDRW]]
+; SI-CHECK: ds_write_b32 {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
+; SI-CHECK-NOT: ds_write_b32 {{v[0-9]*}}, v[[ADDRW]]
 
 ; GROUP_BARRIER must be the last instruction in a clause
 ; EG-CHECK: GROUP_BARRIER
@@ -28,8 +29,10 @@
 ; constant offsets.
 ; EG-CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
 ; EG-CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
-; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]], 0x10
-; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, [[ADDRR]], 0x0,
+; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], 16, v{{[0-9]+}}
+; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]] [M0]
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16 [M0]
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] [M0]
 
 define void @local_memory_two_objects(i32 addrspace(1)* %out) {
 entry:
diff --git a/test/CodeGen/R600/local-memory.ll b/test/CodeGen/R600/local-memory.ll
index 51af484..9b13cb2 100644
--- a/test/CodeGen/R600/local-memory.ll
+++ b/test/CodeGen/R600/local-memory.ll
@@ -1,32 +1,30 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=CI-CHECK %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 
-@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] zeroinitializer, align 4
+@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
 
-; EG-CHECK-LABEL: @local_memory
-; SI-CHECK-LABEL: @local_memory
-; CI-CHECK-LABEL: @local_memory
+; FUNC-LABEL: {{^}}local_memory:
 
 ; Check that the LDS size emitted correctly
-; EG-CHECK: .long 166120
-; EG-CHECK-NEXT: .long 128
-; SI-CHECK: .long 47180
-; SI-CHECK-NEXT: .long 65536
-; CI-CHECK: .long 47180
-; CI-CHECK-NEXT: .long 32768
+; EG: .long 166120
+; EG-NEXT: .long 128
+; SI: .long 47180
+; SI-NEXT: .long 65536
+; CI: .long 47180
+; CI-NEXT: .long 32768
 
-; EG-CHECK: LDS_WRITE
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: DS_WRITE_B32
+; EG: LDS_WRITE
+; SI-NOT: s_wqm_b64
+; SI: ds_write_b32
 
 ; GROUP_BARRIER must be the last instruction in a clause
-; EG-CHECK: GROUP_BARRIER
-; EG-CHECK-NEXT: ALU clause
-; SI-CHECK: S_BARRIER
+; EG: GROUP_BARRIER
+; EG-NEXT: ALU clause
+; SI: s_barrier
 
-; EG-CHECK: LDS_READ_RET
-; SI-CHECK: DS_READ_B32 {{v[0-9]+}},
+; EG: LDS_READ_RET
+; SI: ds_read_b32 {{v[0-9]+}},
 
 define void @local_memory(i32 addrspace(1)* %out) {
 entry:
diff --git a/test/CodeGen/R600/loop-idiom.ll b/test/CodeGen/R600/loop-idiom.ll
index 128f661..0478bdb 100644
--- a/test/CodeGen/R600/loop-idiom.ll
+++ b/test/CodeGen/R600/loop-idiom.ll
@@ -10,8 +10,8 @@ target triple = "r600--"
 ; implementations of these for R600.
 
 ; FUNC: @no_memcpy
-; R600-NOT: @llvm.memcpy
-; SI-NOT: @llvm.memcpy
+; R600-NOT: {{^}}llvm.memcpy
+; SI-NOT: {{^}}llvm.memcpy
 define void @no_memcpy(i8 addrspace(3)* %in, i32 %size) {
 entry:
   %dest = alloca i8, i32 32
@@ -32,10 +32,10 @@ for.end:
 }
 
 ; FUNC: @no_memset
-; R600-NOT: @llvm.memset
-; R600-NOT: @memset_pattern16
-; SI-NOT: @llvm.memset
-; SI-NOT: @memset_pattern16
+; R600-NOT: {{^}}llvm.memset
+; R600-NOT: {{^}}memset_pattern16:
+; SI-NOT: {{^}}llvm.memset
+; SI-NOT: {{^}}memset_pattern16:
 define void @no_memset(i32 %size) {
 entry:
   %dest = alloca i8, i32 32
diff --git a/test/CodeGen/R600/lshl.ll b/test/CodeGen/R600/lshl.ll
index 2162839..9785866 100644
--- a/test/CodeGen/R600/lshl.ll
+++ b/test/CodeGen/R600/lshl.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: S_LSHL_B32 s{{[0-9]}}, s{{[0-9]}}, 1
+;CHECK: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1
 
 define void @test(i32 %p) {
    %i = mul i32 %p, 2
diff --git a/test/CodeGen/R600/lshr.ll b/test/CodeGen/R600/lshr.ll
index 886d1c4..acfc1fd 100644
--- a/test/CodeGen/R600/lshr.ll
+++ b/test/CodeGen/R600/lshr.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: S_LSHR_B32 s{{[0-9]}}, s{{[0-9]}}, 1
+;CHECK: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1
 
 define void @test(i32 %p) {
    %i = udiv i32 %p, 2
diff --git a/test/CodeGen/R600/m0-spill.ll b/test/CodeGen/R600/m0-spill.ll
new file mode 100644
index 0000000..a8b0e0d
--- /dev/null
+++ b/test/CodeGen/R600/m0-spill.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+@lds = external addrspace(3) global [64 x float]
+
+; CHECK-LABEL: {{^}}main:
+; CHECK-NOT: v_readlane_b32 m0
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
+main_body:
+  %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
+  %cmp = fcmp ueq float 0.0, %4
+  br i1 %cmp, label %if, label %else
+
+if:
+  %lds_ptr = getelementptr [64 x float] addrspace(3)* @lds, i32 0, i32 0
+  %lds_data = load float addrspace(3)* %lds_ptr
+  br label %endif
+
+else:
+  %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
+  br label %endif
+
+endif:
+  %export = phi float [%lds_data, %if], [%interp, %else]
+  %5 = call i32 @llvm.SI.packf16(float %export, float %export)
+  %6 = bitcast i32 %5 to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %6, float %6, float %6, float %6)
+  ret void
+}
+
+declare float @llvm.SI.fs.constant(i32, i32, i32) readnone
+
+declare i32 @llvm.SI.packf16(float, float) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/R600/mad-sub.ll b/test/CodeGen/R600/mad-sub.ll
new file mode 100644
index 0000000..240abd0
--- /dev/null
+++ b/test/CodeGen/R600/mad-sub.ll
@@ -0,0 +1,215 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare float @llvm.fabs.f32(float) #0
+
+; FUNC-LABEL: {{^}}mad_sub_f32:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
+; SI: buffer_store_dword [[RESULT]]
+define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %tid.ext = sext i32 %tid to i64
+  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext
+  %add1 = add i64 %tid.ext, 1
+  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1
+  %add2 = add i64 %tid.ext, 2
+  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2
+  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext
+  %a = load float addrspace(1)* %gep0, align 4
+  %b = load float addrspace(1)* %gep1, align 4
+  %c = load float addrspace(1)* %gep2, align 4
+  %mul = fmul float %a, %b
+  %sub = fsub float %mul, %c
+  store float %sub, float addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}mad_sub_inv_f32:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
+; SI: buffer_store_dword [[RESULT]]
+define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %tid.ext = sext i32 %tid to i64
+  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext
+  %add1 = add i64 %tid.ext, 1
+  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1
+  %add2 = add i64 %tid.ext, 2
+  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2
+  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext
+  %a = load float addrspace(1)* %gep0, align 4
+  %b = load float addrspace(1)* %gep1, align 4
+  %c = load float addrspace(1)* %gep2, align 4
+  %mul = fmul float %a, %b
+  %sub = fsub float %c, %mul
+  store float %sub, float addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}mad_sub_f64:
+; SI: v_mul_f64
+; SI: v_add_f64
+define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %tid.ext = sext i32 %tid to i64
+  %gep0 = getelementptr double addrspace(1)* %ptr, i64 %tid.ext
+  %add1 = add i64 %tid.ext, 1
+  %gep1 = getelementptr double addrspace(1)* %ptr, i64 %add1
+  %add2 = add i64 %tid.ext, 2
+  %gep2 = getelementptr double addrspace(1)* %ptr, i64 %add2
+  %outgep = getelementptr double addrspace(1)* %out, i64 %tid.ext
+  %a = load double addrspace(1)* %gep0, align 8
+  %b = load double addrspace(1)* %gep1, align 8
+  %c = load double addrspace(1)* %gep2, align 8
+  %mul = fmul double %a, %b
+  %sub = fsub double %mul, %c
+  store double %sub, double addrspace(1)* %outgep, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}mad_sub_fabs_f32:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
+; SI: buffer_store_dword [[RESULT]]
+define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %tid.ext = sext i32 %tid to i64
+  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext
+  %add1 = add i64 %tid.ext, 1
+  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1
+  %add2 = add i64 %tid.ext, 2
+  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2
+  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext
+  %a = load float addrspace(1)* %gep0, align 4
+  %b = load float addrspace(1)* %gep1, align 4
+  %c = load float addrspace(1)* %gep2, align 4
+  %c.abs = call float @llvm.fabs.f32(float %c) #0
+  %mul = fmul float %a, %b
+  %sub = fsub float %mul, %c.abs
+  store float %sub, float addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}mad_sub_fabs_inv_f32:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
+; SI: buffer_store_dword [[RESULT]]
+define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %tid.ext = sext i32 %tid to i64
+  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext
+  %add1 = add i64 %tid.ext, 1
+  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1
+  %add2 = add i64 %tid.ext, 2
+  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2
+  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext
+  %a = load float addrspace(1)* %gep0, align 4
+  %b = load float addrspace(1)* %gep1, align 4
+  %c = load float addrspace(1)* %gep2, align 4
+  %c.abs = call float @llvm.fabs.f32(float %c) #0
+  %mul = fmul float %a, %b
+  %sub = fsub float %c.abs, %mul
+  store float %sub, float addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}neg_neg_mad_f32:
+; SI: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %tid.ext = sext i32 %tid to i64
+  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext
+  %add1 = add i64 %tid.ext, 1
+  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1
+  %add2 = add i64 %tid.ext, 2
+  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2
+  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext
+  %a = load float addrspace(1)* %gep0, align 4
+  %b = load float addrspace(1)* %gep1, align 4
+  %c = load float addrspace(1)* %gep2, align 4
+  %nega = fsub float -0.000000e+00, %a
+  %negb = fsub float -0.000000e+00, %b
+  %mul = fmul float %nega, %negb
+  %sub = fadd float %mul, %c
+  store float %sub, float addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}mad_fabs_sub_f32:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
+; SI: buffer_store_dword [[RESULT]]
+define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %tid.ext = sext i32 %tid to i64
+  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext
+  %add1 = add i64 %tid.ext, 1
+  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1
+  %add2 = add i64 %tid.ext, 2
+  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2
+  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext
+  %a = load float addrspace(1)* %gep0, align 4
+  %b = load float addrspace(1)* %gep1, align 4
+  %c = load float addrspace(1)* %gep2, align 4
+  %b.abs = call float @llvm.fabs.f32(float %b) #0
+  %mul = fmul float %a, %b.abs
+  %sub = fsub float %mul, %c
+  store float %sub, float addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fsub_c_fadd_a_a:
+; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
+; SI: buffer_store_dword [[RESULT]]
+define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %add = fadd float %r1, %r1
+  %r3 = fsub float %r2, %add
+
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fsub_fadd_a_a_c:
+; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
+; SI: buffer_store_dword [[RESULT]]
+define void @fsub_fadd_a_a_c(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %add = fadd float %r1, %r1
+  %r3 = fsub float %add, %r2
+
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/R600/mad_int24.ll b/test/CodeGen/R600/mad_int24.ll
index abb5290..c8dd377 100644
--- a/test/CodeGen/R600/mad_int24.ll
+++ b/test/CodeGen/R600/mad_int24.ll
@@ -2,14 +2,16 @@
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
-; FUNC-LABEL: @i32_mad24
+declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone
+
+; FUNC-LABEL: {{^}}i32_mad24:
 ; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
 ; EG: MULLO_INT
 ; Make sure we aren't masking the inputs.
 ; CM-NOT: AND
 ; CM: MULADD_INT24
-; SI-NOT: AND
-; SI: V_MAD_I32_I24
+; SI-NOT: and
+; SI: v_mad_i32_i24
 define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = shl i32 %a, 8
@@ -21,3 +23,12 @@ entry:
   store i32 %3, i32 addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: @test_imul24
+; SI: v_mad_i32_i24
+define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
+  %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone
+  %add = add i32 %mul, %src2
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/mad_uint24.ll b/test/CodeGen/R600/mad_uint24.ll
index 0f0893b..b7b32fe 100644
--- a/test/CodeGen/R600/mad_uint24.ll
+++ b/test/CodeGen/R600/mad_uint24.ll
@@ -2,9 +2,9 @@
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
-; FUNC-LABEL: @u32_mad24
+; FUNC-LABEL: {{^}}u32_mad24:
 ; EG: MULADD_UINT24
-; SI: V_MAD_U32_U24
+; SI: v_mad_u32_u24
 
 define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
@@ -18,14 +18,14 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i16_mad24
+; FUNC-LABEL: {{^}}i16_mad24:
 ; The order of A and B does not matter.
 ; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
 ; The result must be sign-extended
 ; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
 ; EG: 16
-; SI: V_MAD_U32_U24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI: V_BFE_I32 v{{[0-9]}}, [[MAD]], 0, 16
+; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16
 
 define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
 entry:
@@ -36,13 +36,13 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i8_mad24
+; FUNC-LABEL: {{^}}i8_mad24:
 ; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
 ; The result must be sign-extended
 ; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
 ; EG: 8
-; SI: V_MAD_U32_U24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8
+; SI: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
 
 define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
 entry:
@@ -60,9 +60,9 @@ entry:
 ; 24-bit mad pattern wasn't being matched.
 
 ; Check that the select instruction is not deleted.
-; FUNC-LABEL: @i24_i32_i32_mad
+; FUNC-LABEL: {{^}}i24_i32_i32_mad:
 ; EG: CNDE_INT
-; SI: V_CNDMASK
+; SI: v_cndmask
 define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
 entry:
   %0 = ashr i32 %a, 8
diff --git a/test/CodeGen/R600/max-literals.ll b/test/CodeGen/R600/max-literals.ll
index 65a6d2b..c357524 100644
--- a/test/CodeGen/R600/max-literals.ll
+++ b/test/CodeGen/R600/max-literals.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK: @main
+; CHECK-LABEL: {{^}}main:
 ; CHECK: ADD *
 
 define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
@@ -29,7 +29,7 @@ main_body:
   ret void
 }
 
-; CHECK: @main
+; CHECK-LABEL: {{^}}main2:
 ; CHECK-NOT: ADD *
 
 define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
diff --git a/test/CodeGen/R600/max.ll b/test/CodeGen/R600/max.ll
new file mode 100644
index 0000000..d67ef47
--- /dev/null
+++ b/test/CodeGen/R600/max.ll
@@ -0,0 +1,99 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: @v_test_imax_sge_i32
+; SI: v_max_i32_e32
+define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp sge i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @s_test_imax_sge_i32
+; SI: s_max_i32
+define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %cmp = icmp sge i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_imax_sgt_i32
+; SI: v_max_i32_e32
+define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp sgt i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @s_test_imax_sgt_i32
+; SI: s_max_i32
+define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %cmp = icmp sgt i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_umax_uge_i32
+; SI: v_max_u32_e32
+define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp uge i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @s_test_umax_uge_i32
+; SI: s_max_u32
+define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %cmp = icmp uge i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_umax_ugt_i32
+; SI: v_max_u32_e32
+define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp ugt i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @s_test_umax_ugt_i32
+; SI: s_max_u32
+define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %cmp = icmp ugt i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/max3.ll b/test/CodeGen/R600/max3.ll
new file mode 100644
index 0000000..74b08f6
--- /dev/null
+++ b/test/CodeGen/R600/max3.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: @v_test_imax3_sgt_i32
+; SI: v_max3_i32
+define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %icmp0 = icmp sgt i32 %a, %b
+  %i0 = select i1 %icmp0, i32 %a, i32 %b
+  %icmp1 = icmp sgt i32 %i0, %c
+  %i1 = select i1 %icmp1, i32 %i0, i32 %c
+  store i32 %i1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_umax3_ugt_i32
+; SI: v_max3_u32
+define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %icmp0 = icmp ugt i32 %a, %b
+  %i0 = select i1 %icmp0, i32 %a, i32 %b
+  %icmp1 = icmp ugt i32 %i0, %c
+  %i1 = select i1 %icmp1, i32 %i0, i32 %c
+  store i32 %i1, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/min.ll b/test/CodeGen/R600/min.ll
new file mode 100644
index 0000000..88c0dff
--- /dev/null
+++ b/test/CodeGen/R600/min.ll
@@ -0,0 +1,99 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: @v_test_imin_sle_i32
+; SI: v_min_i32_e32
+define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp sle i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @s_test_imin_sle_i32
+; SI: s_min_i32
+define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %cmp = icmp sle i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_imin_slt_i32
+; SI: v_min_i32_e32
+define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp slt i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @s_test_imin_slt_i32
+; SI: s_min_i32
+define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %cmp = icmp slt i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_umin_ule_i32
+; SI: v_min_u32_e32
+define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp ule i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @s_test_umin_ule_i32
+; SI: s_min_u32
+define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %cmp = icmp ule i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_umin_ult_i32
+; SI: v_min_u32_e32
+define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp ult i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @s_test_umin_ult_i32
+; SI: s_min_u32
+define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %cmp = icmp ult i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/min3.ll b/test/CodeGen/R600/min3.ll
new file mode 100644
index 0000000..f852cff
--- /dev/null
+++ b/test/CodeGen/R600/min3.ll
@@ -0,0 +1,111 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: @v_test_imin3_slt_i32
+; SI: v_min3_i32
+define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %icmp0 = icmp slt i32 %a, %b
+  %i0 = select i1 %icmp0, i32 %a, i32 %b
+  %icmp1 = icmp slt i32 %i0, %c
+  %i1 = select i1 %icmp1, i32 %i0, i32 %c
+  store i32 %i1, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_umin3_ult_i32
+; SI: v_min3_u32
+define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %icmp0 = icmp ult i32 %a, %b
+  %i0 = select i1 %icmp0, i32 %a, i32 %b
+  %icmp1 = icmp ult i32 %i0, %c
+  %i1 = select i1 %icmp1, i32 %i0, i32 %c
+  store i32 %i1, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_umin_umin_umin
+; SI: v_min_i32
+; SI: v_min3_i32
+define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %tid2 = mul i32 %tid, 2
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
+
+  %gep3 = getelementptr i32 addrspace(1)* %aptr, i32 %tid2
+  %gep4 = getelementptr i32 addrspace(1)* %bptr, i32 %tid2
+  %gep5 = getelementptr i32 addrspace(1)* %cptr, i32 %tid2
+
+  %outgep0 = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %outgep1 = getelementptr i32 addrspace(1)* %out, i32 %tid2
+
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %d = load i32 addrspace(1)* %gep3, align 4
+
+  %icmp0 = icmp slt i32 %a, %b
+  %i0 = select i1 %icmp0, i32 %a, i32 %b
+
+  %icmp1 = icmp slt i32 %c, %d
+  %i1 = select i1 %icmp1, i32 %c, i32 %d
+
+  %icmp2 = icmp slt i32 %i0, %i1
+  %i2 = select i1 %icmp2, i32 %i0, i32 %i1
+
+  store i32 %i2, i32 addrspace(1)* %outgep1, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_umin3_2_uses
+; SI-NOT: v_min3
+define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %tid2 = mul i32 %tid, 2
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
+
+  %gep3 = getelementptr i32 addrspace(1)* %aptr, i32 %tid2
+  %gep4 = getelementptr i32 addrspace(1)* %bptr, i32 %tid2
+  %gep5 = getelementptr i32 addrspace(1)* %cptr, i32 %tid2
+
+  %outgep0 = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %outgep1 = getelementptr i32 addrspace(1)* %out, i32 %tid2
+
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %d = load i32 addrspace(1)* %gep3, align 4
+
+  %icmp0 = icmp slt i32 %a, %b
+  %i0 = select i1 %icmp0, i32 %a, i32 %b
+
+  %icmp1 = icmp slt i32 %c, %d
+  %i1 = select i1 %icmp1, i32 %c, i32 %d
+
+  %icmp2 = icmp slt i32 %i0, %c
+  %i2 = select i1 %icmp2, i32 %i0, i32 %c
+
+  store i32 %i2, i32 addrspace(1)* %outgep0, align 4
+  store i32 %i0, i32 addrspace(1)* %outgep1, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/missing-store.ll b/test/CodeGen/R600/missing-store.ll
new file mode 100644
index 0000000..5346046
--- /dev/null
+++ b/test/CodeGen/R600/missing-store.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+
+@ptr_load = addrspace(3) global i32 addrspace(2)* undef, align 8
+
+; Make sure when the load from %ptr2 is folded the chain isn't lost,
+; resulting in losing the store to gptr
+
+; FUNC-LABEL: {{^}}missing_store_reduced:
+; SI: ds_read_b64
+; SI: buffer_store_dword
+; SI: buffer_load_dword
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+  %ptr0 = load i32 addrspace(2)* addrspace(3)* @ptr_load, align 8
+  %ptr2 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 2
+
+  store i32 99, i32 addrspace(1)* %gptr, align 4
+  %tmp2 = load i32 addrspace(2)* %ptr2, align 4
+
+  store i32 %tmp2, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/R600/mubuf.ll b/test/CodeGen/R600/mubuf.ll
index f465d3d..c2efda4 100644
--- a/test/CodeGen/R600/mubuf.ll
+++ b/test/CodeGen/R600/mubuf.ll
@@ -1,12 +1,14 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.r600.read.tidig.x() readnone
 
 ;;;==========================================================================;;;
 ;;; MUBUF LOAD TESTS
 ;;;==========================================================================;;;
 
 ; MUBUF load with an immediate byte offset that fits into 12-bits
-; CHECK-LABEL: @mubuf_load0
-; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x4 ; encoding: [0x04,0x80
+; CHECK-LABEL: {{^}}mubuf_load0:
+; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0x4 ; encoding: [0x04,0x00,0x30,0xe0
 define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %in, i64 1
@@ -16,8 +18,8 @@ entry:
 }
 
 ; MUBUF load with the largest possible immediate offset
-; CHECK-LABEL: @mubuf_load1
-; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0xfff ; encoding: [0xff,0x8f
+; CHECK-LABEL: {{^}}mubuf_load1:
+; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0xfff ; encoding: [0xff,0x0f,0x20,0xe0
 define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = getelementptr i8 addrspace(1)* %in, i64 4095
@@ -27,8 +29,8 @@ entry:
 }
 
 ; MUBUF load with an immediate byte offset that doesn't fit into 12-bits
-; CHECK-LABEL: @mubuf_load2
-; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x0 ; encoding: [0x00,0x80
+; CHECK-LABEL: {{^}}mubuf_load2:
+; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 ; encoding: [0x00,0x80,0x30,0xe0
 define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %in, i64 1024
@@ -38,9 +40,9 @@ entry:
 }
 
 ; MUBUF load with a 12-bit immediate offset and a register offset
-; CHECK-LABEL: @mubuf_load3
+; CHECK-LABEL: {{^}}mubuf_load3:
 ; CHECK-NOT: ADD
-; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x4 ; encoding: [0x04,0x80
+; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:0x4 ; encoding: [0x04,0x80,0x30,0xe0
 define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %in, i64 %offset
@@ -55,8 +57,8 @@ entry:
 ;;;==========================================================================;;;
 
 ; MUBUF store with an immediate byte offset that fits into 12-bits
-; CHECK-LABEL: @mubuf_store0
-; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x4 ; encoding: [0x04,0x80
+; CHECK-LABEL: {{^}}mubuf_store0:
+; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0x4 ; encoding: [0x04,0x00,0x70,0xe0
 define void @mubuf_store0(i32 addrspace(1)* %out) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %out, i64 1
@@ -65,8 +67,8 @@ entry:
 }
 
 ; MUBUF store with the largest possible immediate offset
-; CHECK-LABEL: @mubuf_store1
-; CHECK: BUFFER_STORE_BYTE v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0xfff ; encoding: [0xff,0x8f
+; CHECK-LABEL: {{^}}mubuf_store1:
+; CHECK: buffer_store_byte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0xfff ; encoding: [0xff,0x0f,0x60,0xe0
 
 define void @mubuf_store1(i8 addrspace(1)* %out) {
 entry:
@@ -76,8 +78,8 @@ entry:
 }
 
 ; MUBUF store with an immediate byte offset that doesn't fit into 12-bits
-; CHECK-LABEL: @mubuf_store2
-; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x0 ; encoding: [0x00,0x80
+; CHECK-LABEL: {{^}}mubuf_store2:
+; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0 addr64 ; encoding: [0x00,0x80,0x70,0xe0
 define void @mubuf_store2(i32 addrspace(1)* %out) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %out, i64 1024
@@ -86,9 +88,9 @@ entry:
 }
 
 ; MUBUF store with a 12-bit immediate offset and a register offset
-; CHECK-LABEL: @mubuf_store3
+; CHECK-LABEL: {{^}}mubuf_store3:
 ; CHECK-NOT: ADD
-; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x4 ; encoding: [0x04,0x80
+; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:0x4 ; encoding: [0x04,0x80,0x70,0xe0
 define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %out, i64 %offset
@@ -96,3 +98,35 @@ entry:
   store i32 0, i32 addrspace(1)* %1
   ret void
 }
+
+; CHECK-LABEL: {{^}}store_sgpr_ptr:
+; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0
+define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 {
+  store i32 99, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_sgpr_ptr_offset:
+; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:0x28
+define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
+  %out.gep = getelementptr i32 addrspace(1)* %out, i32 10
+  store i32 99, i32 addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset:
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
+  %out.gep = getelementptr i32 addrspace(1)* %out, i32 32768
+  store i32 99, i32 addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_vgpr_ptr:
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() readnone
+  %out.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  store i32 99, i32 addrspace(1)* %out.gep, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/mul.ll b/test/CodeGen/R600/mul.ll
index d231e92..be5d6a0 100644
--- a/test/CodeGen/R600/mul.ll
+++ b/test/CodeGen/R600/mul.ll
@@ -3,14 +3,14 @@
 
 ; mul24 and mad24 are affected
 
-; FUNC-LABEL: @test2
+; FUNC-LABEL: {{^}}test_mul_v2i32:
 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32> addrspace(1) * %in
   %b = load <2 x i32> addrspace(1) * %b_ptr
@@ -19,18 +19,18 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   ret void
 }
 
-; FUNC-LABEL: @test4
+; FUNC-LABEL: {{^}}v_mul_v4i32:
 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
   %b = load <4 x i32> addrspace(1) * %b_ptr
@@ -39,12 +39,26 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   ret void
 }
 
-; FUNC-LABEL: @trunc_i64_mul_to_i32
-; SI: S_LOAD_DWORD
-; SI: S_LOAD_DWORD
-; SI: V_MUL_LO_I32
-; SI: BUFFER_STORE_DWORD
-define void @trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32:
+; SI: s_load_dword
+; SI: s_load_dword
+; SI: s_mul_i32
+; SI: buffer_store_dword
+define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+  %mul = mul i64 %b, %a
+  %trunc = trunc i64 %mul to i32
+  store i32 %trunc, i32 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32:
+; SI: s_load_dword
+; SI: s_load_dword
+; SI: v_mul_lo_i32
+; SI: buffer_store_dword
+define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %a = load i64 addrspace(1)* %aptr, align 8
+  %b = load i64 addrspace(1)* %bptr, align 8
   %mul = mul i64 %b, %a
   %trunc = trunc i64 %mul to i32
   store i32 %trunc, i32 addrspace(1)* %out, align 8
@@ -53,11 +67,11 @@ define void @trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 
 ; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top
 ; 32-bits of both arguments are sign bits.
-; FUNC-LABEL: @mul64_sext_c
+; FUNC-LABEL: {{^}}mul64_sext_c:
 ; EG-DAG: MULLO_INT
 ; EG-DAG: MULHI_INT
-; SI-DAG: V_MUL_LO_I32
-; SI-DAG: V_MUL_HI_I32
+; SI-DAG: s_mul_i32
+; SI-DAG: v_mul_hi_i32
 define void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = sext i32 %in to i64
@@ -66,16 +80,120 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}v_mul64_sext_c:
+; EG-DAG: MULLO_INT
+; EG-DAG: MULHI_INT
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_i32
+; SI: s_endpgm
+define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %val = load i32 addrspace(1)* %in, align 4
+  %ext = sext i32 %val to i64
+  %mul = mul i64 %ext, 80
+  store i64 %mul, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm:
+; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, 9, v{{[0-9]+}}
+; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, 9, v{{[0-9]+}}
+; SI: s_endpgm
+define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %val = load i32 addrspace(1)* %in, align 4
+  %ext = sext i32 %val to i64
+  %mul = mul i64 %ext, 9
+  store i64 %mul, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_mul_i32:
+; SI: s_load_dword [[SRC0:s[0-9]+]],
+; SI: s_load_dword [[SRC1:s[0-9]+]],
+; SI: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
+define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %mul = mul i32 %a, %b
+  store i32 %mul, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_mul_i32:
+; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %a = load i32 addrspace(1)* %in
+  %b = load i32 addrspace(1)* %b_ptr
+  %result = mul i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
 ; A standard 64-bit multiply.  The expansion should be around 6 instructions.
 ; It would be difficult to match the expansion correctly without writing
 ; a really complicated list of FileCheck expressions.  I don't want
 ; to confuse people who may 'break' this test with a correct optimization,
 ; so this test just uses FUNC-LABEL to make sure the compiler does not
 ; crash with a 'failed to select' error.
-; FUNC-LABEL: @mul64
-define void @mul64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+
+; FUNC-LABEL: {{^}}s_mul_i64:
+define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+  %mul = mul i64 %a, %b
+  store i64 %mul, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_mul_i64:
+; SI: v_mul_lo_i32
+define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+  %a = load i64 addrspace(1)* %aptr, align 8
+  %b = load i64 addrspace(1)* %bptr, align 8
+  %mul = mul i64 %a, %b
+  store i64 %mul, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}mul32_in_branch:
+; SI: s_mul_i32
+define void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) {
+entry:
+  %0 = icmp eq i32 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = load i32 addrspace(1)* %in
+  br label %endif
+
+else:
+  %2 = mul i32 %a, %b
+  br label %endif
+
+endif:
+  %3 = phi i32 [%1, %if], [%2, %else]
+  store i32 %3, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}mul64_in_branch:
+; SI-DAG: s_mul_i32
+; SI-DAG: v_mul_hi_u32
+; SI: s_endpgm
+define void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
-  %0 = mul i64 %a, %b
-  store i64 %0, i64 addrspace(1)* %out
+  %0 = icmp eq i64 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = load i64 addrspace(1)* %in
+  br label %endif
+
+else:
+  %2 = mul i64 %a, %b
+  br label %endif
+
+endif:
+  %3 = phi i64 [%1, %if], [%2, %else]
+  store i64 %3, i64 addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/mul_int24.ll b/test/CodeGen/R600/mul_int24.ll
index 046911b..be58f7e 100644
--- a/test/CodeGen/R600/mul_int24.ll
+++ b/test/CodeGen/R600/mul_int24.ll
@@ -2,14 +2,14 @@
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
-; FUNC-LABEL: @i32_mul24
+; FUNC-LABEL: {{^}}i32_mul24:
 ; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
 ; EG: MULLO_INT
 ; Make sure we are not masking the inputs
 ; CM-NOT: AND
 ; CM: MUL_INT24
-; SI-NOT: AND
-; SI: V_MUL_I32_I24
+; SI-NOT: and
+; SI: v_mul_i32_i24
 define void @i32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = shl i32 %a, 8
diff --git a/test/CodeGen/R600/mul_uint24.ll b/test/CodeGen/R600/mul_uint24.ll
index 419f275..8d1cda8 100644
--- a/test/CodeGen/R600/mul_uint24.ll
+++ b/test/CodeGen/R600/mul_uint24.ll
@@ -2,9 +2,9 @@
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
-; FUNC-LABEL: @u32_mul24
+; FUNC-LABEL: {{^}}u32_mul24:
 ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
-; SI: V_MUL_U32_U24
+; SI: v_mul_u32_u24
 
 define void @u32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
@@ -17,13 +17,13 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i16_mul24
+; FUNC-LABEL: {{^}}i16_mul24:
 ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
 ; The result must be sign-extended
 ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
 ; EG: 16
-; SI: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 16,
+; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
 define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %0 = mul i16 %a, %b
@@ -32,12 +32,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i8_mul24
+; FUNC-LABEL: {{^}}i8_mul24:
 ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
 ; The result must be sign-extended
 ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
-; SI: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8,
+; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
 
 define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) {
 entry:
@@ -48,12 +48,12 @@ entry:
 }
 
 ; Multiply with 24-bit inputs and 64-bit output
-; FUNC_LABEL: @mul24_i64
+; FUNC_LABEL: {{^}}mul24_i64:
 ; EG; MUL_UINT24
 ; EG: MULHI
-; SI: V_MUL_U32_U24
+; SI: v_mul_u32_u24
 ; FIXME: SI support 24-bit mulhi
-; SI: V_MUL_HI_U32
+; SI: v_mul_hi_u32
 define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = shl i64 %a, 40
diff --git a/test/CodeGen/R600/mulhu.ll b/test/CodeGen/R600/mulhu.ll
index 8640127..82a0783 100644
--- a/test/CodeGen/R600/mulhu.ll
+++ b/test/CodeGen/R600/mulhu.ll
@@ -1,8 +1,8 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: V_MOV_B32_e32 v{{[0-9]+}}, 0xaaaaaaab
-;CHECK: V_MUL_HI_U32 v0, {{[sv][0-9]+}}, {{v[0-9]+}}
-;CHECK-NEXT: V_LSHRREV_B32_e32 v0, 1, v0
+;CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
+;CHECK: v_mul_hi_u32 v0, {{[sv][0-9]+}}, {{v[0-9]+}}
+;CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
 
 define void @test(i32 %p) {
    %i = udiv i32 %p, 3
diff --git a/test/CodeGen/R600/no-initializer-constant-addrspace.ll b/test/CodeGen/R600/no-initializer-constant-addrspace.ll
index ab82e7e..cd2dca3 100644
--- a/test/CodeGen/R600/no-initializer-constant-addrspace.ll
+++ b/test/CodeGen/R600/no-initializer-constant-addrspace.ll
@@ -3,7 +3,7 @@
 
 @extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4
 
-; FUNC-LABEL: @load_extern_const_init
+; FUNC-LABEL: {{^}}load_extern_const_init:
 define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
   %val = load i32 addrspace(2)* getelementptr ([5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -12,7 +12,7 @@ define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
 
 @undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4
 
-; FUNC-LABEL: @load_undef_const_init
+; FUNC-LABEL: {{^}}load_undef_const_init:
 define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind {
   %val = load i32 addrspace(2)* getelementptr ([5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/operand-spacing.ll b/test/CodeGen/R600/operand-spacing.ll
new file mode 100644
index 0000000..f0d228d
--- /dev/null
+++ b/test/CodeGen/R600/operand-spacing.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+
+; Make sure there isn't an extra space between the instruction name and first operands.
+
+; SI-LABEL: {{^}}add_f32:
+; SI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
+; SI: buffer_store_dword [[RESULT]],
+define void @add_f32(float addrspace(1)* %out, float %a, float %b) {
+  %result = fadd float %a, %b
+  store float %result, float addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
index 91a70b7..b7493d3 100644
--- a/test/CodeGen/R600/or.ll
+++ b/test/CodeGen/R600/or.ll
@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
 
-; EG-LABEL: @or_v2i32
+; EG-LABEL: {{^}}or_v2i32:
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI-LABEL: @or_v2i32
-; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI-LABEL: {{^}}or_v2i32:
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -18,17 +18,17 @@ define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in)
   ret void
 }
 
-; EG-LABEL: @or_v4i32
+; EG-LABEL: {{^}}or_v4i32:
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI-LABEL: @or_v4i32
-; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI-LABEL: {{^}}or_v4i32:
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -39,16 +39,16 @@ define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in)
   ret void
 }
 
-; SI-LABEL: @scalar_or_i32
-; SI: S_OR_B32
+; SI-LABEL: {{^}}scalar_or_i32:
+; SI: s_or_b32
 define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %or = or i32 %a, %b
   store i32 %or, i32 addrspace(1)* %out
   ret void
 }
 
-; SI-LABEL: @vector_or_i32
-; SI: V_OR_B32_e32 v{{[0-9]}}
+; SI-LABEL: {{^}}vector_or_i32:
+; SI: v_or_b32_e32 v{{[0-9]}}
 define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) {
   %loada = load i32 addrspace(1)* %a
   %or = or i32 %loada, %b
@@ -56,20 +56,46 @@ define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b)
   ret void
 }
 
-; EG-LABEL: @scalar_or_i64
+; SI-LABEL: {{^}}scalar_or_literal_i32:
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f
+define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
+  %or = or i32 %a, 99999
+  store i32 %or, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}vector_or_literal_i32:
+; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
+define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+  %loada = load i32 addrspace(1)* %a, align 4
+  %or = or i32 %loada, 65535
+  store i32 %or, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}vector_or_inline_immediate_i32:
+; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}}
+define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+  %loada = load i32 addrspace(1)* %a, align 4
+  %or = or i32 %loada, 4
+  store i32 %or, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-LABEL: {{^}}scalar_or_i64:
 ; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
 ; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
-; SI-LABEL: @scalar_or_i64
-; SI: S_OR_B64
+; SI-LABEL: {{^}}scalar_or_i64:
+; SI: s_or_b64
 define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %or = or i64 %a, %b
   store i64 %or, i64 addrspace(1)* %out
   ret void
 }
 
-; SI-LABEL: @vector_or_i64
-; SI: V_OR_B32_e32 v{{[0-9]}}
-; SI: V_OR_B32_e32 v{{[0-9]}}
+; SI-LABEL: {{^}}vector_or_i64:
+; SI: v_or_b32_e32 v{{[0-9]}}
+; SI: v_or_b32_e32 v{{[0-9]}}
 define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64 addrspace(1)* %a, align 8
   %loadb = load i64 addrspace(1)* %a, align 8
@@ -78,9 +104,9 @@ define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
   ret void
 }
 
-; SI-LABEL: @scalar_vector_or_i64
-; SI: V_OR_B32_e32 v{{[0-9]}}
-; SI: V_OR_B32_e32 v{{[0-9]}}
+; SI-LABEL: {{^}}scalar_vector_or_i64:
+; SI: v_or_b32_e32 v{{[0-9]}}
+; SI: v_or_b32_e32 v{{[0-9]}}
 define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
   %loada = load i64 addrspace(1)* %a
   %or = or i64 %loada, %b
@@ -88,13 +114,13 @@ define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a,
   ret void
 }
 
-; SI-LABEL: @vector_or_i64_loadimm
-; SI-DAG: S_MOV_B32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f
-; SI-DAG: S_MOV_B32 [[HI_S_IMM:s[0-9]+]], 0x146f
-; SI-DAG: BUFFER_LOAD_DWORDX2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
-; SI-DAG: V_OR_B32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
-; SI-DAG: V_OR_B32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}vector_or_i64_loadimm:
+; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f
+; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x146f
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
 define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64 addrspace(1)* %a, align 8
   %or = or i64 %loada, 22470723082367
@@ -103,11 +129,11 @@ define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a,
 }
 
 ; FIXME: The or 0 should really be removed.
-; SI-LABEL: @vector_or_i64_imm
-; SI: BUFFER_LOAD_DWORDX2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
-; SI: V_OR_B32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]]
-; SI: V_OR_B32_e32 {{v[0-9]+}}, 0, {{.*}}
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}vector_or_i64_imm:
+; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI: v_or_b32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]]
+; SI: v_or_b32_e32 {{v[0-9]+}}, 0, {{.*}}
+; SI: s_endpgm
 define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64 addrspace(1)* %a, align 8
   %or = or i64 %loada, 8
@@ -115,15 +141,31 @@ define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64
   ret void
 }
 
-; SI-LABEL: @trunc_i64_or_to_i32
-; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG0:[0-9]+]]
-; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG1:[0-9]+]]
-; SI: S_OR_B32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
-; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: BUFFER_STORE_DWORD [[VRESULT]],
+; SI-LABEL: {{^}}trunc_i64_or_to_i32:
+; SI: s_load_dword s[[SREG0:[0-9]+]]
+; SI: s_load_dword s[[SREG1:[0-9]+]]
+; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
 define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
   %add = or i64 %b, %a
   %trunc = trunc i64 %add to i32
   store i32 %trunc, i32 addrspace(1)* %out, align 8
   ret void
 }
+
+; EG-CHECK: {{^}}or_i1:
+; EG-CHECK: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
+
+; SI-CHECK: {{^}}or_i1:
+; SI-CHECK: s_or_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
+define void @or_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
+  %a = load float addrspace(1) * %in0
+  %b = load float addrspace(1) * %in1
+  %acmp = fcmp oge float %a, 0.000000e+00
+  %bcmp = fcmp oge float %b, 0.000000e+00
+  %or = or i1 %acmp, %bcmp
+  %result = select i1 %or, float %a, float %b
+  store float %result, float addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/packetizer.ll b/test/CodeGen/R600/packetizer.ll
index 0a405c5..49a7c0d 100644
--- a/test/CodeGen/R600/packetizer.ll
+++ b/test/CodeGen/R600/packetizer.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
 
-; CHECK: @test
+; CHECK: {{^}}test:
 ; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X
 ; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Y
 ; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z
diff --git a/test/CodeGen/R600/parallelandifcollapse.ll b/test/CodeGen/R600/parallelandifcollapse.ll
index 8a269e0..82b1150 100644
--- a/test/CodeGen/R600/parallelandifcollapse.ll
+++ b/test/CodeGen/R600/parallelandifcollapse.ll
@@ -1,5 +1,5 @@
 ; Function Attrs: nounwind
-; RUN: llc < %s -march=r600 -mcpu=redwood  | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca < %s | FileCheck %s
 ;
 ; CFG flattening should use parallel-and mode to generate branch conditions and
 ; then merge if-regions with the same bodies.
@@ -11,7 +11,6 @@
 ; FIXME: For some reason having the allocas here allowed the flatten cfg pass
 ; to do its transfomation, however now that we are using local memory for
 ; allocas, the transformation isn't happening.
-; XFAIL: *
 
 define void @_Z9chk1D_512v() #0 {
 entry:
diff --git a/test/CodeGen/R600/predicate-dp4.ll b/test/CodeGen/R600/predicate-dp4.ll
index e48d6a7..6bc1875 100644
--- a/test/CodeGen/R600/predicate-dp4.ll
+++ b/test/CodeGen/R600/predicate-dp4.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman
 
-; CHECK-LABEL: @main
+; CHECK-LABEL: {{^}}main:
 ; CHECK: PRED_SETE_INT * Pred,
 ; CHECK: DOT4 T{{[0-9]+}}.X, T0.X, T0.X, Pred_sel_one
 define void @main(<4 x float> inreg) #0 {
diff --git a/test/CodeGen/R600/predicates.ll b/test/CodeGen/R600/predicates.ll
index 902508f..0ce74d9 100644
--- a/test/CodeGen/R600/predicates.ll
+++ b/test/CodeGen/R600/predicates.ll
@@ -3,7 +3,7 @@
 ; These tests make sure the compiler is optimizing branches using predicates
 ; when it is legal to do so.
 
-; CHECK: @simple_if
+; CHECK: {{^}}simple_if:
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
 define void @simple_if(i32 addrspace(1)* %out, i32 %in) {
@@ -21,7 +21,7 @@ ENDIF:
   ret void
 }
 
-; CHECK: @simple_if_else
+; CHECK: {{^}}simple_if_else:
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
 ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
@@ -44,7 +44,7 @@ ENDIF:
   ret void
 }
 
-; CHECK: @nested_if
+; CHECK: {{^}}nested_if:
 ; CHECK: ALU_PUSH_BEFORE
 ; CHECK: JUMP
 ; CHECK: POP
@@ -71,7 +71,7 @@ ENDIF:
   ret void
 }
 
-; CHECK: @nested_if_else
+; CHECK: {{^}}nested_if_else:
 ; CHECK: ALU_PUSH_BEFORE
 ; CHECK: JUMP
 ; CHECK: POP
diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
index 89122be..bfb4a6a 100644
--- a/test/CodeGen/R600/private-memory.ll
+++ b/test/CodeGen/R600/private-memory.ll
@@ -1,19 +1,23 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
-; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
-; FUNC-LABEL: @mova_same_clause
+; FUNC-LABEL: {{^}}mova_same_clause:
 
-; R600-CHECK: LDS_WRITE
-; R600-CHECK: LDS_WRITE
-; R600-CHECK: LDS_READ
-; R600-CHECK: LDS_READ
+; R600: LDS_WRITE
+; R600: LDS_WRITE
+; R600: LDS_READ
+; R600: LDS_READ
 
-; SI-CHECK: DS_WRITE_B32
-; SI-CHECK: DS_WRITE_B32
-; SI-CHECK: DS_READ_B32
-; SI-CHECK: DS_READ_B32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+
+; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
+; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
 define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 entry:
   %stack = alloca [5 x i32], align 4
@@ -41,9 +45,10 @@ entry:
 ; XXX: This generated code has unnecessary MOVs, we should be able to optimize
 ; this.
 
-; FUNC-LABEL: @multiple_structs
-; R600-CHECK-NOT: MOVA_INT
-; SI-CHECK-NOT: V_MOVREL
+; FUNC-LABEL: {{^}}multiple_structs:
+; R600-NOT: MOVA_INT
+; SI-NOT: v_movrel
+; SI-NOT: v_movrel
 %struct.point = type { i32, i32 }
 
 define void @multiple_structs(i32 addrspace(1)* %out) {
@@ -71,9 +76,9 @@ entry:
 ; loads and stores should be lowered to copies, so there shouldn't be any
 ; MOVA instructions.
 
-; FUNC-LABEL: @direct_loop
-; R600-CHECK-NOT: MOVA_INT
-; SI-CHECK-NOT: V_MOVREL
+; FUNC-LABEL: {{^}}direct_loop:
+; R600-NOT: MOVA_INT
+; SI-NOT: v_movrel
 
 define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
@@ -107,11 +112,13 @@ for.end:
   ret void
 }
 
-; FUNC-LABEL: @short_array
+; FUNC-LABEL: {{^}}short_array:
 
-; R600-CHECK: MOVA_INT
+; R600: MOVA_INT
 
-; SI-CHECK: V_MOVRELS_B32_e32
+; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0
+; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x2 ; encoding: [0x02,0x10,0x68,0xe0
+; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
 define void @short_array(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %0 = alloca [2 x i16]
@@ -126,12 +133,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @char_array
+; FUNC-LABEL: {{^}}char_array:
 
-; R600-CHECK: MOVA_INT
+; R600: MOVA_INT
 
-; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 0x100
-; SI-CHECK: V_MOVRELS_B32_e32
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x1 ; encoding: [0x01,0x10,0x60,0xe0
 define void @char_array(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %0 = alloca [2 x i8]
@@ -149,12 +156,12 @@ entry:
 
 ; Make sure we don't overwrite workitem information with private memory
 
-; FUNC-LABEL: @work_item_info
-; R600-CHECK-NOT: MOV T0.X
+; FUNC-LABEL: {{^}}work_item_info:
+; R600-NOT: MOV T0.X
 ; Additional check in case the move ends up in the last slot
-; R600-CHECK-NOT: MOV * TO.X
+; R600-NOT: MOV * TO.X
 
-; SI-CHECK-NOT: V_MOV_B32_e{{(32|64)}} v0
+; SI-NOT: v_mov_b32_e{{(32|64)}} v0
 define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = alloca [2 x i32]
@@ -172,11 +179,11 @@ entry:
 
 ; Test that two stack objects are not stored in the same register
 ; The second stack object should be in T3.X
-; FUNC-LABEL: @no_overlap
+; FUNC-LABEL: {{^}}no_overlap:
 ; R600_CHECK: MOV
 ; R600_CHECK: [[CHAN:[XYZW]]]+
-; R600-CHECK-NOT: [[CHAN]]+
-; SI-CHECK: V_MOV_B32_e32 v3
+; R600-NOT: [[CHAN]]+
+; SI: v_mov_b32_e32 v3
 define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = alloca [3 x i8], align 1
@@ -283,3 +290,22 @@ entry:
   ret void
 }
 
+; AMDGPUPromoteAlloca does not know how to handle ptrtoint.  When it
+; finds one, it should stop trying to promote.
+
+; FUNC-LABEL: ptrtoint:
+; SI-NOT: ds_write
+; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
+; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x5
+define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+  %alloca = alloca [16 x i32]
+  %tmp0 = getelementptr [16 x i32]* %alloca, i32 0, i32 %a
+  store i32 5, i32* %tmp0
+  %tmp1 = ptrtoint [16 x i32]* %alloca to i32
+  %tmp2 = add i32 %tmp1, 5
+  %tmp3 = inttoptr i32 %tmp2 to i32*
+  %tmp4 = getelementptr i32* %tmp3, i32 %b
+  %tmp5 = load i32* %tmp4
+  store i32 %tmp5, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/pv.ll b/test/CodeGen/R600/pv.ll
index 55eb56d..1908f15 100644
--- a/test/CodeGen/R600/pv.ll
+++ b/test/CodeGen/R600/pv.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 | FileCheck %s
 
-;CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
-;CHECK: MAX T{{[0-9].[XYZW]}}, PV.X, 0.0
+; CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
+; CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X
 
 define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 {
 main_body:
diff --git a/test/CodeGen/R600/r600-encoding.ll b/test/CodeGen/R600/r600-encoding.ll
index b760c88..112cdac 100644
--- a/test/CodeGen/R600/r600-encoding.ll
+++ b/test/CodeGen/R600/r600-encoding.ll
@@ -4,10 +4,10 @@
 ; The earliest R600 GPUs have a slightly different encoding than the rest of
 ; the VLIW4/5 GPUs.
 
-; EG-CHECK: @test
+; EG-CHECK: {{^}}test:
 ; EG-CHECK: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x01,0x[0-9a-f]+,0x[0-9a-f]+}}]
 
-; R600-CHECK: @test
+; R600-CHECK: {{^}}test:
 ; R600-CHECK: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}]
 
 define void @test(<4 x float> inreg %reg0) #0 {
diff --git a/test/CodeGen/R600/r600-export-fix.ll b/test/CodeGen/R600/r600-export-fix.ll
index 73bc063..7d72856 100644
--- a/test/CodeGen/R600/r600-export-fix.ll
+++ b/test/CodeGen/R600/r600-export-fix.ll
@@ -3,9 +3,9 @@
 ;CHECK:	EXPORT T{{[0-9]}}.XYZW
 ;CHECK:	EXPORT T{{[0-9]}}.0000
 ;CHECK: EXPORT T{{[0-9]}}.0000
-;CHECK: EXPORT T{{[0-9]}}.0XZW
+;CHECK: EXPORT T{{[0-9]}}.0XYZ
 ;CHECK: EXPORT T{{[0-9]}}.XYZW
-;CHECK: EXPORT T{{[0-9]}}.YX00
+;CHECK: EXPORT T{{[0-9]}}.YZ00
 ;CHECK: EXPORT T{{[0-9]}}.0000
 ;CHECK: EXPORT T{{[0-9]}}.0000
 
diff --git a/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll b/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll
index c89398f..f388f8f 100644
--- a/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll
+++ b/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll
@@ -1,5 +1,4 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman
-;REQUIRES: asserts
 
 define void @main(<4 x float> inreg, <4 x float> inreg) #0 {
 main_body:
diff --git a/test/CodeGen/R600/r600cfg.ll b/test/CodeGen/R600/r600cfg.ll
index 6dee3ef..dddc9de 100644
--- a/test/CodeGen/R600/r600cfg.ll
+++ b/test/CodeGen/R600/r600cfg.ll
@@ -1,5 +1,4 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood
-;REQUIRES: asserts
 
 define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
diff --git a/test/CodeGen/R600/register-count-comments.ll b/test/CodeGen/R600/register-count-comments.ll
index 329077c..61d1b5e 100644
--- a/test/CodeGen/R600/register-count-comments.ll
+++ b/test/CodeGen/R600/register-count-comments.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s
 
 declare i32 @llvm.SI.tid() nounwind readnone
 
-; SI-LABEL: @foo:
+; SI-LABEL: {{^}}foo:
 ; SI: .section	.AMDGPU.csdata
 ; SI: ; Kernel info:
 ; SI: ; NumSgprs: {{[0-9]+}}
@@ -18,3 +18,10 @@ define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 a
   store i32 %result, i32 addrspace(1)* %outptr, align 4
   ret void
 }
+
+; SI-LABEL: {{^}}one_vgpr_used:
+; SI: NumVgprs: 1
+define void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind {
+  store i32 %x, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/reorder-stores.ll b/test/CodeGen/R600/reorder-stores.ll
index be2fcc6..30c0171 100644
--- a/test/CodeGen/R600/reorder-stores.ll
+++ b/test/CodeGen/R600/reorder-stores.ll
@@ -1,15 +1,15 @@
 ; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @no_reorder_v2f64_global_load_store
-; SI: BUFFER_LOAD_DWORDX2
-; SI: BUFFER_LOAD_DWORDX2
-; SI: BUFFER_LOAD_DWORDX2
-; SI: BUFFER_LOAD_DWORDX2
-; SI: BUFFER_STORE_DWORDX2
-; SI: BUFFER_STORE_DWORDX2
-; SI: BUFFER_STORE_DWORDX2
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store:
+; SI: buffer_load_dwordx2
+; SI: buffer_load_dwordx2
+; SI: buffer_load_dwordx2
+; SI: buffer_load_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
 define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
   %tmp1 = load <2 x double> addrspace(1)* %x, align 16
   %tmp4 = load <2 x double> addrspace(1)* %y, align 16
@@ -18,12 +18,12 @@ define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocap
   ret void
 }
 
-; SI-LABEL: @no_reorder_scalarized_v2f64_local_load_store
-; SI: DS_READ_B64
-; SI: DS_READ_B64
-; SI: DS_WRITE_B64
-; SI: DS_WRITE_B64
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store:
+; SI: ds_read_b64
+; SI: ds_read_b64
+; SI: ds_write_b64
+; SI: ds_write_b64
+; SI: s_endpgm
 define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
   %tmp1 = load <2 x double> addrspace(3)* %x, align 16
   %tmp4 = load <2 x double> addrspace(3)* %y, align 16
@@ -32,48 +32,48 @@ define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace
   ret void
 }
 
-; SI-LABEL: @no_reorder_split_v8i32_global_load_store
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
+; SI-LABEL: {{^}}no_reorder_split_v8i32_global_load_store:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
 
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
 
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
 
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
 
 
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
 
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
 
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
 
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: S_ENDPGM
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: s_endpgm
 define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
   %tmp1 = load <8 x i32> addrspace(1)* %x, align 32
   %tmp4 = load <8 x i32> addrspace(1)* %y, align 32
@@ -82,13 +82,13 @@ define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* no
   ret void
 }
 
-; SI-LABEL: @no_reorder_extload_64
-; SI: DS_READ_B64
-; SI: DS_READ_B64
-; SI: DS_WRITE_B64
-; SI-NOT: DS_READ
-; SI: DS_WRITE_B64
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}no_reorder_extload_64:
+; SI: ds_read_b64
+; SI: ds_read_b64
+; SI: ds_write_b64
+; SI-NOT: ds_read
+; SI: ds_write_b64
+; SI: s_endpgm
 define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
   %tmp1 = load <2 x i32> addrspace(3)* %x, align 8
   %tmp4 = load <2 x i32> addrspace(3)* %y, align 8
diff --git a/test/CodeGen/R600/rotl.i64.ll b/test/CodeGen/R600/rotl.i64.ll
index bda0b66..84a35b6 100644
--- a/test/CodeGen/R600/rotl.i64.ll
+++ b/test/CodeGen/R600/rotl.i64.ll
@@ -1,10 +1,11 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @s_rotl_i64:
-; SI: S_LSHL_B64
-; SI: S_SUB_I32
-; SI: S_LSHR_B64
-; SI: S_OR_B64
+; FUNC-LABEL: {{^}}s_rotl_i64:
+; SI-DAG: s_lshl_b64
+; SI-DAG: s_sub_i32
+; SI-DAG: s_lshr_b64
+; SI: s_or_b64
+; SI: s_endpgm
 define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
 entry:
   %0 = shl i64 %x, %y
@@ -15,12 +16,13 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @v_rotl_i64:
-; SI: V_LSHL_B64
-; SI: V_SUB_I32
-; SI: V_LSHR_B64
-; SI: V_OR_B32
-; SI: V_OR_B32
+; FUNC-LABEL: {{^}}v_rotl_i64:
+; SI-DAG: v_lshl_b64
+; SI-DAG: v_sub_i32
+; SI: v_lshr_b64
+; SI: v_or_b32
+; SI: v_or_b32
+; SI: s_endpgm
 define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
 entry:
   %x = load i64 addrspace(1)* %xptr, align 8
diff --git a/test/CodeGen/R600/rotl.ll b/test/CodeGen/R600/rotl.ll
index 83f657f..6c8e503 100644
--- a/test/CodeGen/R600/rotl.ll
+++ b/test/CodeGen/R600/rotl.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @rotl_i32:
+; FUNC-LABEL: {{^}}rotl_i32:
 ; R600: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x
 ; R600-NEXT: 32
 ; R600: BIT_ALIGN_INT {{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].Z, PV.{{[XYZW]}}
 
-; SI: S_SUB_I32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}}
-; SI: V_MOV_B32_e32 [[VDST:v[0-9]+]], [[SDST]]
-; SI: V_ALIGNBIT_B32 {{v[0-9]+, [s][0-9]+, v[0-9]+}}, [[VDST]]
+; SI: s_sub_i32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}}
+; SI: v_mov_b32_e32 [[VDST:v[0-9]+]], [[SDST]]
+; SI: v_alignbit_b32 {{v[0-9]+, [s][0-9]+, s[0-9]+}}, [[VDST]]
 define void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
 entry:
   %0 = shl i32 %x, %y
@@ -19,11 +19,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @rotl_v2i32
-; SI: S_SUB_I32
-; SI: V_ALIGNBIT_B32
-; SI: S_SUB_I32
-; SI: V_ALIGNBIT_B32
+; FUNC-LABEL: {{^}}rotl_v2i32:
+; SI-DAG: s_sub_i32
+; SI-DAG: s_sub_i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI: s_endpgm
 define void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
 entry:
   %0 = shl <2 x i32> %x, %y
@@ -34,15 +35,16 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @rotl_v4i32
-; SI: S_SUB_I32
-; SI: V_ALIGNBIT_B32
-; SI: S_SUB_I32
-; SI: V_ALIGNBIT_B32
-; SI: S_SUB_I32
-; SI: V_ALIGNBIT_B32
-; SI: S_SUB_I32
-; SI: V_ALIGNBIT_B32
+; FUNC-LABEL: {{^}}rotl_v4i32:
+; SI-DAG: s_sub_i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: s_sub_i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: s_sub_i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: s_sub_i32
+; SI-DAG: v_alignbit_b32
+; SI: s_endpgm
 define void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
 entry:
   %0 = shl <4 x i32> %x, %y
diff --git a/test/CodeGen/R600/rotr.i64.ll b/test/CodeGen/R600/rotr.i64.ll
index c264751..9e14570 100644
--- a/test/CodeGen/R600/rotr.i64.ll
+++ b/test/CodeGen/R600/rotr.i64.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @s_rotr_i64
-; SI: S_LSHR_B64
-; SI: S_SUB_I32
-; SI: S_LSHL_B64
-; SI: S_OR_B64
+; FUNC-LABEL: {{^}}s_rotr_i64:
+; SI-DAG: s_sub_i32
+; SI-DAG: s_lshr_b64
+; SI-DAG: s_lshl_b64
+; SI: s_or_b64
 define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
 entry:
   %tmp0 = sub i64 64, %y
@@ -15,12 +15,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @v_rotr_i64
-; SI: V_LSHR_B64
-; SI: V_SUB_I32
-; SI: V_LSHL_B64
-; SI: V_OR_B32
-; SI: V_OR_B32
+; FUNC-LABEL: {{^}}v_rotr_i64:
+; SI-DAG: v_sub_i32
+; SI-DAG: v_lshr_b64
+; SI-DAG: v_lshl_b64
+; SI: v_or_b32
+; SI: v_or_b32
 define void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
 entry:
   %x = load i64 addrspace(1)* %xptr, align 8
@@ -33,7 +33,7 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @s_rotr_v2i64
+; FUNC-LABEL: {{^}}s_rotr_v2i64:
 define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) {
 entry:
   %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y
@@ -44,7 +44,7 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @v_rotr_v2i64
+; FUNC-LABEL: {{^}}v_rotr_v2i64:
 define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) {
 entry:
   %x = load <2 x i64> addrspace(1)* %xptr, align 8
diff --git a/test/CodeGen/R600/rotr.ll b/test/CodeGen/R600/rotr.ll
index a5a4da4..a1add11 100644
--- a/test/CodeGen/R600/rotr.ll
+++ b/test/CodeGen/R600/rotr.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @rotr_i32:
+; FUNC-LABEL: {{^}}rotr_i32:
 ; R600: BIT_ALIGN_INT
 
-; SI: V_ALIGNBIT_B32
+; SI: v_alignbit_b32
 define void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
 entry:
   %tmp0 = sub i32 32, %y
@@ -15,12 +15,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @rotr_v2i32:
+; FUNC-LABEL: {{^}}rotr_v2i32:
 ; R600: BIT_ALIGN_INT
 ; R600: BIT_ALIGN_INT
 
-; SI: V_ALIGNBIT_B32
-; SI: V_ALIGNBIT_B32
+; SI: v_alignbit_b32
+; SI: v_alignbit_b32
 define void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
 entry:
   %tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
@@ -31,16 +31,16 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @rotr_v4i32:
+; FUNC-LABEL: {{^}}rotr_v4i32:
 ; R600: BIT_ALIGN_INT
 ; R600: BIT_ALIGN_INT
 ; R600: BIT_ALIGN_INT
 ; R600: BIT_ALIGN_INT
 
-; SI: V_ALIGNBIT_B32
-; SI: V_ALIGNBIT_B32
-; SI: V_ALIGNBIT_B32
-; SI: V_ALIGNBIT_B32
+; SI: v_alignbit_b32
+; SI: v_alignbit_b32
+; SI: v_alignbit_b32
+; SI: v_alignbit_b32
 define void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
 entry:
   %tmp0 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
diff --git a/test/CodeGen/R600/rsq.ll b/test/CodeGen/R600/rsq.ll
index 87c0570..d792c9f 100644
--- a/test/CodeGen/R600/rsq.ll
+++ b/test/CodeGen/R600/rsq.ll
@@ -1,11 +1,12 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
 
 declare float @llvm.sqrt.f32(float) nounwind readnone
 declare double @llvm.sqrt.f64(double) nounwind readnone
 
-; SI-LABEL: @rsq_f32
-; SI: V_RSQ_F32_e32
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}rsq_f32:
+; SI: v_rsq_f32_e32
+; SI: s_endpgm
 define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %val = load float addrspace(1)* %in, align 4
   %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
@@ -14,9 +15,10 @@ define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noali
   ret void
 }
 
-; SI-LABEL: @rsq_f64
-; SI: V_RSQ_F64_e32
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}rsq_f64:
+; SI-UNSAFE: v_rsq_f64_e32
+; SI-SAFE: v_sqrt_f64_e32
+; SI: s_endpgm
 define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
   %val = load double addrspace(1)* %in, align 4
   %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone
@@ -24,3 +26,13 @@ define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noa
   store double %div, double addrspace(1)* %out, align 4
   ret void
 }
+
+; SI-LABEL: {{^}}rsq_f32_sgpr:
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+; SI: s_endpgm
+define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind {
+  %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
+  %div = fdiv float 1.0, %sqrt
+  store float %div, float addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/s_movk_i32.ll b/test/CodeGen/R600/s_movk_i32.ll
new file mode 100644
index 0000000..71f9a41
--- /dev/null
+++ b/test/CodeGen/R600/s_movk_i32.ll
@@ -0,0 +1,184 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_movk_i32_k0:
+; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32)
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k1:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32)
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k2:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 64{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32)
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k3:
+; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32)
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k4:
+; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x20000{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32)
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k5:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0xffef{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0xff00ffff{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k6:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x41{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 63{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 270582939713 ; 65 | (63 << 32)
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k7:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x2000{{$}}
+; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x4000{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32)
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+
+; SI-LABEL: {{^}}s_movk_i32_k8:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k9:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8001{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k10:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8888{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k11:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8fff{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k12:
+; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff7001{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/saddo.ll b/test/CodeGen/R600/saddo.ll
index c80480e..654967c 100644
--- a/test/CodeGen/R600/saddo.ll
+++ b/test/CodeGen/R600/saddo.ll
@@ -4,7 +4,7 @@
 declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
 
-; FUNC-LABEL: @saddo_i64_zext
+; FUNC-LABEL: {{^}}saddo_i64_zext:
 define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %sadd, 0
@@ -15,7 +15,7 @@ define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @s_saddo_i32
+; FUNC-LABEL: {{^}}s_saddo_i32:
 define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
   %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %sadd, 0
@@ -25,7 +25,7 @@ define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
   ret void
 }
 
-; FUNC-LABEL: @v_saddo_i32
+; FUNC-LABEL: {{^}}v_saddo_i32:
 define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32 addrspace(1)* %aptr, align 4
   %b = load i32 addrspace(1)* %bptr, align 4
@@ -37,7 +37,7 @@ define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
   ret void
 }
 
-; FUNC-LABEL: @s_saddo_i64
+; FUNC-LABEL: {{^}}s_saddo_i64:
 define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %sadd, 0
@@ -47,9 +47,9 @@ define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
   ret void
 }
 
-; FUNC-LABEL: @v_saddo_i64
-; SI: V_ADD_I32
-; SI: V_ADDC_U32
+; FUNC-LABEL: {{^}}v_saddo_i64:
+; SI: v_add_i32
+; SI: v_addc_u32
 define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %a = load i64 addrspace(1)* %aptr, align 4
   %b = load i64 addrspace(1)* %bptr, align 4
diff --git a/test/CodeGen/R600/salu-to-valu.ll b/test/CodeGen/R600/salu-to-valu.ll
index e7719b6..23af3e4 100644
--- a/test/CodeGen/R600/salu-to-valu.ll
+++ b/test/CodeGen/R600/salu-to-valu.ll
@@ -7,15 +7,15 @@
 ; sgpr register pair and use that for the pointer operand
 ; (low 64-bits of srsrc).
 
-; CHECK-LABEL: @mubuf
+; CHECK-LABEL: {{^}}mubuf:
 
-; Make sure we aren't using VGPRs for the source operand of S_MOV_B64
-; CHECK-NOT: S_MOV_B64 s[{{[0-9]+:[0-9]+}}], v
+; Make sure we aren't using VGPRs for the source operand of s_mov_b64
+; CHECK-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], v
 
 ; Make sure we aren't using VGPR's for the srsrc operand of BUFFER_LOAD_*
 ; instructions
-; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]
-; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]
+; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
 define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() #1
@@ -49,9 +49,9 @@ attributes #1 = { nounwind readnone }
 
 ; Test moving an SMRD instruction to the VALU
 
-; CHECK-LABEL: @smrd_valu
-; CHECK: BUFFER_LOAD_DWORD [[OUT:v[0-9]+]]
-; CHECK: BUFFER_STORE_DWORD [[OUT]]
+; CHECK-LABEL: {{^}}smrd_valu:
+; CHECK: buffer_load_dword [[OUT:v[0-9]+]]
+; CHECK: buffer_store_dword [[OUT]]
 
 define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 addrspace(1)* %out) {
 entry:
@@ -77,8 +77,8 @@ endif:
 
 ; Test moving ann SMRD with an immediate offset to the VALU
 
-; CHECK-LABEL: @smrd_valu2
-; CHECK: BUFFER_LOAD_DWORD
+; CHECK-LABEL: {{^}}smrd_valu2:
+; CHECK: buffer_load_dword
 define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -88,3 +88,31 @@ entry:
   store i32 %3, i32 addrspace(1)* %out
   ret void
 }
+
+; CHECK-LABEL: {{^}}s_load_imm_v8i32:
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) {
+entry:
+  %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
+  %tmp1 = getelementptr inbounds i32 addrspace(2)* %in, i32 %tmp0
+  %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)*
+  %tmp3 = load <8 x i32> addrspace(2)* %tmp2, align 4
+  store <8 x i32> %tmp3, <8 x i32> addrspace(1)* %out, align 32
+  ret void
+}
+
+; CHECK-LABEL: {{^}}s_load_imm_v16i32:
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) {
+entry:
+  %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
+  %tmp1 = getelementptr inbounds i32 addrspace(2)* %in, i32 %tmp0
+  %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)*
+  %tmp3 = load <16 x i32> addrspace(2)* %tmp2, align 4
+  store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32
+  ret void
+}
diff --git a/test/CodeGen/R600/scalar_to_vector.ll b/test/CodeGen/R600/scalar_to_vector.ll
index bcccb06..dc9ebe0 100644
--- a/test/CodeGen/R600/scalar_to_vector.ll
+++ b/test/CodeGen/R600/scalar_to_vector.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 
-; FUNC-LABEL: @scalar_to_vector_v2i32
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_LSHRREV_B32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}scalar_to_vector_v2i32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: s_endpgm
 define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %tmp1 = load i32 addrspace(1)* %in, align 4
   %bc = bitcast i32 %tmp1 to <2 x i16>
@@ -17,14 +17,14 @@ define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(
   ret void
 }
 
-; FUNC-LABEL: @scalar_to_vector_v2f32
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_LSHRREV_B32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}scalar_to_vector_v2f32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: s_endpgm
 define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tmp1 = load float addrspace(1)* %in, align 4
   %bc = bitcast float %tmp1 to <2 x i16>
diff --git a/test/CodeGen/R600/schedule-global-loads.ll b/test/CodeGen/R600/schedule-global-loads.ll
new file mode 100644
index 0000000..5422ca7
--- /dev/null
+++ b/test/CodeGen/R600/schedule-global-loads.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FIXME: This currently doesn't do a great job of clustering the
+; loads, which end up with extra moves between them. Right now, it
+; seems the only things areLoadsFromSameBasePtr is accomplishing is
+; ordering the loads so that the lower address loads come first.
+
+; FUNC-LABEL: {{^}}cluster_global_arg_loads:
+; SI-DAG: buffer_load_dword [[REG0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:0x4
+; SI: buffer_store_dword [[REG0]]
+; SI: buffer_store_dword [[REG1]]
+define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
+  %load0 = load i32 addrspace(1)* %ptr, align 4
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 1
+  %load1 = load i32 addrspace(1)* %gep, align 4
+  store i32 %load0, i32 addrspace(1)* %out0, align 4
+  store i32 %load1, i32 addrspace(1)* %out1, align 4
+  ret void
+}
+
+; Test for a crach in SIInstrInfo::areLoadsFromSameBasePtr() when checking
+; an MUBUF load which does not have a vaddr operand.
+; FUNC-LABEL: {{^}}same_base_ptr_crash:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
+entry:
+  %out1 = getelementptr i32 addrspace(1)* %out, i32 %offset
+  %tmp0 = load i32 addrspace(1)* %out
+  %tmp1 = load i32 addrspace(1)* %out1
+  %tmp2 = add i32 %tmp0, %tmp1
+  store i32 %tmp2, i32 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/schedule-kernel-arg-loads.ll b/test/CodeGen/R600/schedule-kernel-arg-loads.ll
new file mode 100644
index 0000000..e774157
--- /dev/null
+++ b/test/CodeGen/R600/schedule-kernel-arg-loads.ll
@@ -0,0 +1,12 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+
+; FUNC-LABEL: {{^}}cluster_arg_loads:
+; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
+define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
+  store i32 %x, i32 addrspace(1)* %out0, align 4
+  store i32 %y, i32 addrspace(1)* %out1, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll
index 3d2142d..baac5b5 100644
--- a/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll
+++ b/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll
@@ -5,7 +5,7 @@
 declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
 
 
-; SI-LABEL: @main(
+; SI-LABEL: {{^}}main(
 define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
   %0 = extractelement <4 x float> %reg1, i32 0
diff --git a/test/CodeGen/R600/sdiv.ll b/test/CodeGen/R600/sdiv.ll
index e922d5c..16853e0 100644
--- a/test/CodeGen/R600/sdiv.ll
+++ b/test/CodeGen/R600/sdiv.ll
@@ -10,7 +10,7 @@
 ; This was fixed by adding an additional pattern in R600Instructions.td to
 ; match this pattern with a CNDGE_INT.
 
-; FUNC-LABEL: @sdiv_i32
+; FUNC-LABEL: {{^}}sdiv_i32:
 ; EG: CF_END
 define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
@@ -21,7 +21,7 @@ define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   ret void
 }
 
-; FUNC-LABEL: @sdiv_i32_4
+; FUNC-LABEL: {{^}}sdiv_i32_4:
 define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %num = load i32 addrspace(1) * %in
   %result = sdiv i32 %num, 4
@@ -32,16 +32,16 @@ define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; Multiply by a weird constant to make sure setIntDivIsCheap is
 ; working.
 
-; FUNC-LABEL: @slow_sdiv_i32_3435
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_MOV_B32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
-; SI: V_MUL_HI_I32 [[TMP:v[0-9]+]], [[VAL]], [[MAGIC]]
-; SI: V_ADD_I32
-; SI: V_LSHRREV_B32
-; SI: V_ASHRREV_I32
-; SI: V_ADD_I32
-; SI: BUFFER_STORE_DWORD
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}slow_sdiv_i32_3435:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
+; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[VAL]], [[MAGIC]]
+; SI: v_add_i32
+; SI: v_lshrrev_b32
+; SI: v_ashrrev_i32
+; SI: v_add_i32
+; SI: buffer_store_dword
+; SI: s_endpgm
 define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %num = load i32 addrspace(1) * %in
   %result = sdiv i32 %num, 3435
diff --git a/test/CodeGen/R600/sdivrem24.ll b/test/CodeGen/R600/sdivrem24.ll
new file mode 100644
index 0000000..228cf76
--- /dev/null
+++ b/test/CodeGen/R600/sdivrem24.ll
@@ -0,0 +1,238 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}sdiv24_i8:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+  %den_ptr = getelementptr i8 addrspace(1)* %in, i8 1
+  %num = load i8 addrspace(1) * %in
+  %den = load i8 addrspace(1) * %den_ptr
+  %result = sdiv i8 %num, %den
+  store i8 %result, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sdiv24_i16:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+  %den_ptr = getelementptr i16 addrspace(1)* %in, i16 1
+  %num = load i16 addrspace(1) * %in, align 2
+  %den = load i16 addrspace(1) * %den_ptr, align 2
+  %result = sdiv i16 %num, %den
+  store i16 %result, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sdiv24_i32:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = ashr i32 %num.i24.0, 8
+  %den.i24 = ashr i32 %den.i24.0, 8
+  %result = sdiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sdiv25_i32:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = ashr i32 %num.i24.0, 7
+  %den.i24 = ashr i32 %den.i24.0, 7
+  %result = sdiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_sdiv24_i32_1:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = ashr i32 %num.i24.0, 8
+  %den.i24 = ashr i32 %den.i24.0, 7
+  %result = sdiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_sdiv24_i32_2:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = ashr i32 %num.i24.0, 7
+  %den.i24 = ashr i32 %den.i24.0, 8
+  %result = sdiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srem24_i8:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+  %den_ptr = getelementptr i8 addrspace(1)* %in, i8 1
+  %num = load i8 addrspace(1) * %in
+  %den = load i8 addrspace(1) * %den_ptr
+  %result = srem i8 %num, %den
+  store i8 %result, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srem24_i16:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+  %den_ptr = getelementptr i16 addrspace(1)* %in, i16 1
+  %num = load i16 addrspace(1) * %in, align 2
+  %den = load i16 addrspace(1) * %den_ptr, align 2
+  %result = srem i16 %num, %den
+  store i16 %result, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srem24_i32:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = ashr i32 %num.i24.0, 8
+  %den.i24 = ashr i32 %den.i24.0, 8
+  %result = srem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srem25_i32:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = ashr i32 %num.i24.0, 7
+  %den.i24 = ashr i32 %den.i24.0, 7
+  %result = srem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_srem24_i32_1:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_srem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = ashr i32 %num.i24.0, 8
+  %den.i24 = ashr i32 %den.i24.0, 7
+  %result = srem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_srem24_i32_2:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_srem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = ashr i32 %num.i24.0, 7
+  %den.i24 = ashr i32 %den.i24.0, 8
+  %result = srem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/select-i1.ll b/test/CodeGen/R600/select-i1.ll
new file mode 100644
index 0000000..2e2d0e4
--- /dev/null
+++ b/test/CodeGen/R600/select-i1.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FIXME: This should go in existing select.ll test, except the current testcase there is broken on SI
+
+; FUNC-LABEL: {{^}}select_i1:
+; SI: v_cndmask_b32
+; SI-NOT: v_cndmask_b32
+define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind {
+  %cmp = icmp ugt i32 %cond, 5
+  %sel = select i1 %cmp, i1 %a, i1 %b
+  store i1 %sel, i1 addrspace(1)* %out, align 4
+  ret void
+}
+
diff --git a/test/CodeGen/R600/select-vectors.ll b/test/CodeGen/R600/select-vectors.ll
index 94605fe..7d8df2e 100644
--- a/test/CodeGen/R600/select-vectors.ll
+++ b/test/CodeGen/R600/select-vectors.ll
@@ -4,11 +4,11 @@
 ; Evergreen not enabled since it seems to be having problems with doubles.
 
 
-; FUNC-LABEL: @select_v4i8
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
+; FUNC-LABEL: {{^}}select_v4i8:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
 define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind {
   %cmp = icmp eq i8 %c, 0
   %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
@@ -16,11 +16,11 @@ define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b,
   ret void
 }
 
-; FUNC-LABEL: @select_v4i16
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
+; FUNC-LABEL: {{^}}select_v4i16:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
 define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
@@ -28,10 +28,10 @@ define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16>
   ret void
 }
 
-; FUNC-LABEL: @select_v2i32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}select_v2i32:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: buffer_store_dwordx2
 define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
@@ -39,12 +39,12 @@ define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32>
   ret void
 }
 
-; FUNC-LABEL: @select_v4i32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: BUFFER_STORE_DWORDX4
+; FUNC-LABEL: {{^}}select_v4i32:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: buffer_store_dwordx4
 define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
@@ -52,15 +52,15 @@ define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32>
   ret void
 }
 
-; FUNC-LABEL: @select_v8i32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
+; FUNC-LABEL: {{^}}select_v8i32:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
 define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
@@ -68,8 +68,8 @@ define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32>
   ret void
 }
 
-; FUNC-LABEL: @select_v2f32
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}select_v2f32:
+; SI: buffer_store_dwordx2
 define void @select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
@@ -77,8 +77,8 @@ define void @select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x f
   ret void
 }
 
-; FUNC-LABEL: @select_v4f32
-; SI: BUFFER_STORE_DWORDX4
+; FUNC-LABEL: {{^}}select_v4f32:
+; SI: buffer_store_dwordx4
 define void @select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
@@ -86,15 +86,15 @@ define void @select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x f
   ret void
 }
 
-; FUNC-LABEL: @select_v8f32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
+; FUNC-LABEL: {{^}}select_v8f32:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
 define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
@@ -102,11 +102,11 @@ define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x f
   ret void
 }
 
-; FUNC-LABEL: @select_v2f64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
+; FUNC-LABEL: {{^}}select_v2f64:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
 define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
@@ -114,15 +114,15 @@ define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x
   ret void
 }
 
-; FUNC-LABEL: @select_v4f64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
+; FUNC-LABEL: {{^}}select_v4f64:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
 define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
@@ -130,23 +130,23 @@ define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x
   ret void
 }
 
-; FUNC-LABEL: @select_v8f64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
+; FUNC-LABEL: {{^}}select_v8f64:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
 define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
diff --git a/test/CodeGen/R600/select.ll b/test/CodeGen/R600/select.ll
index f940142..45f3cd5 100644
--- a/test/CodeGen/R600/select.ll
+++ b/test/CodeGen/R600/select.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
 
 ; Normally icmp + select is optimized to select_cc, when this happens the
 ; DAGLegalizer never sees the select and doesn't have a chance to leaglize it.
@@ -6,13 +7,13 @@
 ; In order to avoid the select_cc optimization, this test case calculates the
 ; condition for the select in a separate basic block.
 
-; CHECK-LABEL: @select
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
+; FUNC-LABEL: {{^}}select:
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
 define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out,
                      <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out,
                      <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out,
diff --git a/test/CodeGen/R600/select64.ll b/test/CodeGen/R600/select64.ll
index 6b87d98..8de34d5 100644
--- a/test/CodeGen/R600/select64.ll
+++ b/test/CodeGen/R600/select64.ll
@@ -1,11 +1,11 @@
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
 
-; CHECK-LABEL: @select0
+; CHECK-LABEL: {{^}}select0:
 ; i64 select should be split into two i32 selects, and we shouldn't need
 ; to use a shfit to extract the hi dword of the input.
-; CHECK-NOT: S_LSHR_B64
-; CHECK: V_CNDMASK
-; CHECK: V_CNDMASK
+; CHECK-NOT: s_lshr_b64
+; CHECK: v_cndmask
+; CHECK: v_cndmask
 define void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
 entry:
   %0 = icmp ugt i32 %cond, 5
@@ -13,3 +13,38 @@ entry:
   store i64 %1, i64 addrspace(1)* %out
   ret void
 }
+
+; CHECK-LABEL: {{^}}select_trunc_i64:
+; CHECK: v_cndmask_b32
+; CHECK-NOT: v_cndmask_b32
+define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
+  %cmp = icmp ugt i32 %cond, 5
+  %sel = select i1 %cmp, i64 0, i64 %in
+  %trunc = trunc i64 %sel to i32
+  store i32 %trunc, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}select_trunc_i64_2:
+; CHECK: v_cndmask_b32
+; CHECK-NOT: v_cndmask_b32
+define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
+  %cmp = icmp ugt i32 %cond, 5
+  %sel = select i1 %cmp, i64 %a, i64 %b
+  %trunc = trunc i64 %sel to i32
+  store i32 %trunc, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_select_trunc_i64_2:
+; CHECK: v_cndmask_b32
+; CHECK-NOT: v_cndmask_b32
+define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %cmp = icmp ugt i32 %cond, 5
+  %a = load i64 addrspace(1)* %aptr, align 8
+  %b = load i64 addrspace(1)* %bptr, align 8
+  %sel = select i1 %cmp, i64 %a, i64 %b
+  %trunc = trunc i64 %sel to i32
+  store i32 %trunc, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/selectcc-opt.ll b/test/CodeGen/R600/selectcc-opt.ll
index 834c030..82577bb 100644
--- a/test/CodeGen/R600/selectcc-opt.ll
+++ b/test/CodeGen/R600/selectcc-opt.ll
@@ -1,8 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
-; CHECK: @test_a
-; CHECK-NOT: CND
-; CHECK: SET{{[NEQGTL]+}}_DX10
+
+; FUNC-LABEL: {{^}}test_a:
+; EG-NOT: CND
+; EG: SET{{[NEQGTL]+}}_DX10
 
 define void @test_a(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -28,10 +30,10 @@ ENDIF:
 ; Same as test_a, but the branch labels are swapped to produce the inverse cc
 ; for the icmp instruction
 
-; CHECK: @test_b
-; CHECK: SET{{[GTEQN]+}}_DX10
-; CHECK-NEXT: PRED_
-; CHECK-NEXT: ALU clause starting
+; EG-LABEL: {{^}}test_b:
+; EG: SET{{[GTEQN]+}}_DX10
+; EG-NEXT: PRED_
+; EG-NEXT: ALU clause starting
 define void @test_b(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 0.0
@@ -54,8 +56,8 @@ ENDIF:
 }
 
 ; Test a CND*_INT instruction with float true/false values
-; CHECK: @test_c
-; CHECK: CND{{[GTE]+}}_INT
+; EG-LABEL: {{^}}test_c:
+; EG: CND{{[GTE]+}}_INT
 define void @test_c(float addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp sgt i32 %in, 0
@@ -63,3 +65,15 @@ entry:
   store float %1, float addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}selectcc_bool:
+; SI: v_cmp_ne_i32
+; SI-NEXT: v_cndmask_b32_e64
+; SI-NOT: cmp
+; SI-NOT: cndmask
+define void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %icmp0 = icmp ne i32 %a, %b
+  %ext = select i1 %icmp0, i32 -1, i32 0
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/selectcc.ll b/test/CodeGen/R600/selectcc.ll
index a8f57cf..5a09b5c 100644
--- a/test/CodeGen/R600/selectcc.ll
+++ b/test/CodeGen/R600/selectcc.ll
@@ -1,15 +1,15 @@
 ; RUN: llc -verify-machineinstrs -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @selectcc_i64
+; FUNC-LABEL: {{^}}selectcc_i64:
 ; EG: XOR_INT
 ; EG: XOR_INT
 ; EG: OR_INT
 ; EG: CNDE_INT
 ; EG: CNDE_INT
-; SI: V_CMP_EQ_I64
-; SI: V_CNDMASK
-; SI: V_CNDMASK
+; SI: v_cmp_eq_i64
+; SI: v_cndmask
+; SI: v_cndmask
 define void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
 entry:
   %0 = icmp eq i64 %lhs, %rhs
diff --git a/test/CodeGen/R600/set-dx10.ll b/test/CodeGen/R600/set-dx10.ll
index 5c7d499..53694dc 100644
--- a/test/CodeGen/R600/set-dx10.ll
+++ b/test/CodeGen/R600/set-dx10.ll
@@ -4,7 +4,7 @@
 ; to store integer true (-1) and false (0) values are lowered to one of the
 ; SET*DX10 instructions.
 
-; CHECK: @fcmp_une_select_fptosi
+; CHECK: {{^}}fcmp_une_select_fptosi:
 ; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -18,7 +18,7 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_une_select_i32
+; CHECK: {{^}}fcmp_une_select_i32:
 ; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -30,7 +30,7 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_oeq_select_fptosi
+; CHECK: {{^}}fcmp_oeq_select_fptosi:
 ; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -44,7 +44,7 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_oeq_select_i32
+; CHECK: {{^}}fcmp_oeq_select_i32:
 ; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -56,7 +56,7 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_ogt_select_fptosi
+; CHECK: {{^}}fcmp_ogt_select_fptosi:
 ; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -70,7 +70,7 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_ogt_select_i32
+; CHECK: {{^}}fcmp_ogt_select_i32:
 ; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -82,7 +82,7 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_oge_select_fptosi
+; CHECK: {{^}}fcmp_oge_select_fptosi:
 ; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -96,7 +96,7 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_oge_select_i32
+; CHECK: {{^}}fcmp_oge_select_i32:
 ; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -108,7 +108,7 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_ole_select_fptosi
+; CHECK: {{^}}fcmp_ole_select_fptosi:
 ; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -122,7 +122,7 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_ole_select_i32
+; CHECK: {{^}}fcmp_ole_select_i32:
 ; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -134,7 +134,7 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_olt_select_fptosi
+; CHECK: {{^}}fcmp_olt_select_fptosi:
 ; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -148,7 +148,7 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_olt_select_i32
+; CHECK: {{^}}fcmp_olt_select_i32:
 ; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
diff --git a/test/CodeGen/R600/setcc-equivalent.ll b/test/CodeGen/R600/setcc-equivalent.ll
index f796748..11ea793 100644
--- a/test/CodeGen/R600/setcc-equivalent.ll
+++ b/test/CodeGen/R600/setcc-equivalent.ll
@@ -1,7 +1,6 @@
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
-; XFAIL: *
 
-; EG-LABEL: @and_setcc_setcc_i32
+; EG-LABEL: {{^}}and_setcc_setcc_i32:
 ; EG: AND_INT
 ; EG-NEXT: SETE_INT
 define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
@@ -13,7 +12,7 @@ define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   ret void
 }
 
-; EG-LABEL: @and_setcc_setcc_v4i32
+; EG-LABEL: {{^}}and_setcc_setcc_v4i32:
 ; EG: AND_INT
 ; EG: AND_INT
 ; EG: SETE_INT
@@ -28,4 +27,4 @@ define void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <
   %ext = sext <4 x i1> %and to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out, align 4
   ret void
-}
-\ No newline at end of file
+}
diff --git a/test/CodeGen/R600/setcc-opt.ll b/test/CodeGen/R600/setcc-opt.ll
new file mode 100644
index 0000000..af48df8
--- /dev/null
+++ b/test/CodeGen/R600/setcc-opt.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; SI-LABEL: {{^}}sext_bool_icmp_ne:
+; SI: v_cmp_ne_i32
+; SI-NEXT: v_cndmask_b32
+; SI-NOT: v_cmp_ne_i32
+; SI-NOT: v_cndmask_b32
+; SI: s_endpgm
+define void @sext_bool_icmp_ne(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %icmp0 = icmp ne i32 %a, %b
+  %ext = sext i1 %icmp0 to i32
+  %icmp1 = icmp ne i32 %ext, 0
+  store i1 %icmp1, i1 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/setcc.ll b/test/CodeGen/R600/setcc.ll
index 5bd95b7..8dd2ce4 100644
--- a/test/CodeGen/R600/setcc.ll
+++ b/test/CodeGen/R600/setcc.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
 ;RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
 
-; FUNC-LABEL: @setcc_v2i32
+; FUNC-LABEL: {{^}}setcc_v2i32:
 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y
 
@@ -12,7 +12,7 @@ define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %
   ret void
 }
 
-; FUNC-LABEL: @setcc_v4i32
+; FUNC-LABEL: {{^}}setcc_v4i32:
 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
@@ -32,9 +32,9 @@ define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %
 ;; Float comparisons
 ;;;==========================================================================;;;
 
-; FUNC-LABEL: @f32_oeq
+; FUNC-LABEL: {{^}}f32_oeq:
 ; R600: SETE_DX10
-; SI: V_CMP_EQ_F32
+; SI: v_cmp_eq_f32
 define void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp oeq float %a, %b
@@ -43,9 +43,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f32_ogt
+; FUNC-LABEL: {{^}}f32_ogt:
 ; R600: SETGT_DX10
-; SI: V_CMP_GT_F32
+; SI: v_cmp_gt_f32
 define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ogt float %a, %b
@@ -54,9 +54,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f32_oge
+; FUNC-LABEL: {{^}}f32_oge:
 ; R600: SETGE_DX10
-; SI: V_CMP_GE_F32
+; SI: v_cmp_ge_f32
 define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp oge float %a, %b
@@ -65,9 +65,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f32_olt
+; FUNC-LABEL: {{^}}f32_olt:
 ; R600: SETGT_DX10
-; SI: V_CMP_LT_F32
+; SI: v_cmp_lt_f32
 define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp olt float %a, %b
@@ -76,9 +76,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f32_ole
+; FUNC-LABEL: {{^}}f32_ole:
 ; R600: SETGE_DX10
-; SI: V_CMP_LE_F32
+; SI: v_cmp_le_f32
 define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ole float %a, %b
@@ -87,18 +87,18 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f32_one
+; FUNC-LABEL: {{^}}f32_one:
 ; R600-DAG: SETE_DX10
 ; R600-DAG: SETE_DX10
 ; R600-DAG: AND_INT
 ; R600-DAG: SETNE_DX10
 ; R600-DAG: AND_INT
 ; R600-DAG: SETNE_INT
-; SI: V_CMP_O_F32
-; SI: V_CMP_NEQ_F32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_AND_B32_e32
+; SI: v_cmp_o_f32
+; SI: v_cmp_neq_f32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_and_b32_e32
 define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp one float %a, %b
@@ -107,12 +107,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f32_ord
+; FUNC-LABEL: {{^}}f32_ord:
 ; R600-DAG: SETE_DX10
 ; R600-DAG: SETE_DX10
 ; R600-DAG: AND_INT
 ; R600-DAG: SETNE_INT
-; SI: V_CMP_O_F32
+; SI: v_cmp_o_f32
 define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ord float %a, %b
@@ -121,18 +121,18 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f32_ueq
+; FUNC-LABEL: {{^}}f32_ueq:
 ; R600-DAG: SETNE_DX10
 ; R600-DAG: SETNE_DX10
 ; R600-DAG: OR_INT
 ; R600-DAG: SETE_DX10
 ; R600-DAG: OR_INT
 ; R600-DAG: SETNE_INT
-; SI: V_CMP_U_F32
-; SI: V_CMP_EQ_F32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; SI: v_cmp_u_f32
+; SI: v_cmp_eq_f32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ueq float %a, %b
@@ -141,14 +141,14 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f32_ugt
+; FUNC-LABEL: {{^}}f32_ugt:
 ; R600: SETGE
 ; R600: SETE_DX10
-; SI: V_CMP_U_F32
-; SI: V_CMP_GT_F32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; SI: v_cmp_u_f32
+; SI: v_cmp_gt_f32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ugt float %a, %b
@@ -157,14 +157,14 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f32_uge
+; FUNC-LABEL: {{^}}f32_uge:
 ; R600: SETGT
 ; R600: SETE_DX10
-; SI: V_CMP_U_F32
-; SI: V_CMP_GE_F32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; SI: v_cmp_u_f32
+; SI: v_cmp_ge_f32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp uge float %a, %b
@@ -173,14 +173,14 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f32_ult
+; FUNC-LABEL: {{^}}f32_ult:
 ; R600: SETGE
 ; R600: SETE_DX10
-; SI: V_CMP_U_F32
-; SI: V_CMP_LT_F32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; SI: v_cmp_u_f32
+; SI: v_cmp_lt_f32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ult float %a, %b
@@ -189,14 +189,14 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f32_ule
+; FUNC-LABEL: {{^}}f32_ule:
 ; R600: SETGT
 ; R600: SETE_DX10
-; SI: V_CMP_U_F32
-; SI: V_CMP_LE_F32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; SI: v_cmp_u_f32
+; SI: v_cmp_le_f32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ule float %a, %b
@@ -205,9 +205,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f32_une
+; FUNC-LABEL: {{^}}f32_une:
 ; R600: SETNE_DX10
-; SI: V_CMP_NEQ_F32
+; SI: v_cmp_neq_f32
 define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp une float %a, %b
@@ -216,12 +216,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f32_uno
+; FUNC-LABEL: {{^}}f32_uno:
 ; R600: SETNE_DX10
 ; R600: SETNE_DX10
 ; R600: OR_INT
 ; R600: SETNE_INT
-; SI: V_CMP_U_F32
+; SI: v_cmp_u_f32
 define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp uno float %a, %b
@@ -234,9 +234,9 @@ entry:
 ;; 32-bit integer comparisons
 ;;;==========================================================================;;;
 
-; FUNC-LABEL: @i32_eq
+; FUNC-LABEL: {{^}}i32_eq:
 ; R600: SETE_INT
-; SI: V_CMP_EQ_I32
+; SI: v_cmp_eq_i32
 define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp eq i32 %a, %b
@@ -245,9 +245,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i32_ne
+; FUNC-LABEL: {{^}}i32_ne:
 ; R600: SETNE_INT
-; SI: V_CMP_NE_I32
+; SI: v_cmp_ne_i32
 define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp ne i32 %a, %b
@@ -256,9 +256,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i32_ugt
+; FUNC-LABEL: {{^}}i32_ugt:
 ; R600: SETGT_UINT
-; SI: V_CMP_GT_U32
+; SI: v_cmp_gt_u32
 define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp ugt i32 %a, %b
@@ -267,9 +267,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i32_uge
+; FUNC-LABEL: {{^}}i32_uge:
 ; R600: SETGE_UINT
-; SI: V_CMP_GE_U32
+; SI: v_cmp_ge_u32
 define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp uge i32 %a, %b
@@ -278,9 +278,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i32_ult
+; FUNC-LABEL: {{^}}i32_ult:
 ; R600: SETGT_UINT
-; SI: V_CMP_LT_U32
+; SI: v_cmp_lt_u32
 define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp ult i32 %a, %b
@@ -289,9 +289,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i32_ule
+; FUNC-LABEL: {{^}}i32_ule:
 ; R600: SETGE_UINT
-; SI: V_CMP_LE_U32
+; SI: v_cmp_le_u32
 define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp ule i32 %a, %b
@@ -300,9 +300,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i32_sgt
+; FUNC-LABEL: {{^}}i32_sgt:
 ; R600: SETGT_INT
-; SI: V_CMP_GT_I32
+; SI: v_cmp_gt_i32
 define void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp sgt i32 %a, %b
@@ -311,9 +311,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i32_sge
+; FUNC-LABEL: {{^}}i32_sge:
 ; R600: SETGE_INT
-; SI: V_CMP_GE_I32
+; SI: v_cmp_ge_i32
 define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp sge i32 %a, %b
@@ -322,9 +322,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i32_slt
+; FUNC-LABEL: {{^}}i32_slt:
 ; R600: SETGT_INT
-; SI: V_CMP_LT_I32
+; SI: v_cmp_lt_i32
 define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp slt i32 %a, %b
@@ -333,9 +333,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i32_sle
+; FUNC-LABEL: {{^}}i32_sle:
 ; R600: SETGE_INT
-; SI: V_CMP_LE_I32
+; SI: v_cmp_le_i32
 define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp sle i32 %a, %b
diff --git a/test/CodeGen/R600/setcc64.ll b/test/CodeGen/R600/setcc64.ll
index 54a33b3..6e43172 100644
--- a/test/CodeGen/R600/setcc64.ll
+++ b/test/CodeGen/R600/setcc64.ll
@@ -6,8 +6,8 @@
 ;; Double comparisons
 ;;;==========================================================================;;;
 
-; FUNC-LABEL: @f64_oeq
-; SI: V_CMP_EQ_F64
+; FUNC-LABEL: {{^}}f64_oeq:
+; SI: v_cmp_eq_f64
 define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp oeq double %a, %b
@@ -16,8 +16,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f64_ogt
-; SI: V_CMP_GT_F64
+; FUNC-LABEL: {{^}}f64_ogt:
+; SI: v_cmp_gt_f64
 define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ogt double %a, %b
@@ -26,8 +26,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f64_oge
-; SI: V_CMP_GE_F64
+; FUNC-LABEL: {{^}}f64_oge:
+; SI: v_cmp_ge_f64
 define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp oge double %a, %b
@@ -36,8 +36,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f64_olt
-; SI: V_CMP_LT_F64
+; FUNC-LABEL: {{^}}f64_olt:
+; SI: v_cmp_lt_f64
 define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp olt double %a, %b
@@ -46,8 +46,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f64_ole
-; SI: V_CMP_LE_F64
+; FUNC-LABEL: {{^}}f64_ole:
+; SI: v_cmp_le_f64
 define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ole double %a, %b
@@ -56,12 +56,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f64_one
-; SI: V_CMP_O_F64
-; SI: V_CMP_NEQ_F64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_AND_B32_e32
+; FUNC-LABEL: {{^}}f64_one:
+; SI: v_cmp_o_f64
+; SI: v_cmp_neq_f64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_and_b32_e32
 define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp one double %a, %b
@@ -70,8 +70,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f64_ord
-; SI: V_CMP_O_F64
+; FUNC-LABEL: {{^}}f64_ord:
+; SI: v_cmp_o_f64
 define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ord double %a, %b
@@ -80,12 +80,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f64_ueq
-; SI: V_CMP_U_F64
-; SI: V_CMP_EQ_F64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; FUNC-LABEL: {{^}}f64_ueq:
+; SI: v_cmp_u_f64
+; SI: v_cmp_eq_f64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ueq double %a, %b
@@ -94,12 +94,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f64_ugt
-; SI: V_CMP_U_F64
-; SI: V_CMP_GT_F64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; FUNC-LABEL: {{^}}f64_ugt:
+; SI: v_cmp_u_f64
+; SI: v_cmp_gt_f64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ugt double %a, %b
@@ -108,12 +108,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f64_uge
-; SI: V_CMP_U_F64
-; SI: V_CMP_GE_F64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; FUNC-LABEL: {{^}}f64_uge:
+; SI: v_cmp_u_f64
+; SI: v_cmp_ge_f64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp uge double %a, %b
@@ -122,12 +122,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f64_ult
-; SI: V_CMP_U_F64
-; SI: V_CMP_LT_F64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; FUNC-LABEL: {{^}}f64_ult:
+; SI: v_cmp_u_f64
+; SI: v_cmp_lt_f64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ult double %a, %b
@@ -136,12 +136,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f64_ule
-; SI: V_CMP_U_F64
-; SI: V_CMP_LE_F64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; FUNC-LABEL: {{^}}f64_ule:
+; SI: v_cmp_u_f64
+; SI: v_cmp_le_f64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ule double %a, %b
@@ -150,8 +150,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f64_une
-; SI: V_CMP_NEQ_F64
+; FUNC-LABEL: {{^}}f64_une:
+; SI: v_cmp_neq_f64
 define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp une double %a, %b
@@ -160,8 +160,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @f64_uno
-; SI: V_CMP_U_F64
+; FUNC-LABEL: {{^}}f64_uno:
+; SI: v_cmp_u_f64
 define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp uno double %a, %b
@@ -174,8 +174,8 @@ entry:
 ;; 64-bit integer comparisons
 ;;;==========================================================================;;;
 
-; FUNC-LABEL: @i64_eq
-; SI: V_CMP_EQ_I64
+; FUNC-LABEL: {{^}}i64_eq:
+; SI: v_cmp_eq_i64
 define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp eq i64 %a, %b
@@ -184,8 +184,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i64_ne
-; SI: V_CMP_NE_I64
+; FUNC-LABEL: {{^}}i64_ne:
+; SI: v_cmp_ne_i64
 define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp ne i64 %a, %b
@@ -194,8 +194,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i64_ugt
-; SI: V_CMP_GT_U64
+; FUNC-LABEL: {{^}}i64_ugt:
+; SI: v_cmp_gt_u64
 define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp ugt i64 %a, %b
@@ -204,8 +204,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i64_uge
-; SI: V_CMP_GE_U64
+; FUNC-LABEL: {{^}}i64_uge:
+; SI: v_cmp_ge_u64
 define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp uge i64 %a, %b
@@ -214,8 +214,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i64_ult
-; SI: V_CMP_LT_U64
+; FUNC-LABEL: {{^}}i64_ult:
+; SI: v_cmp_lt_u64
 define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp ult i64 %a, %b
@@ -224,8 +224,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i64_ule
-; SI: V_CMP_LE_U64
+; FUNC-LABEL: {{^}}i64_ule:
+; SI: v_cmp_le_u64
 define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp ule i64 %a, %b
@@ -234,8 +234,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i64_sgt
-; SI: V_CMP_GT_I64
+; FUNC-LABEL: {{^}}i64_sgt:
+; SI: v_cmp_gt_i64
 define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp sgt i64 %a, %b
@@ -244,8 +244,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i64_sge
-; SI: V_CMP_GE_I64
+; FUNC-LABEL: {{^}}i64_sge:
+; SI: v_cmp_ge_i64
 define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp sge i64 %a, %b
@@ -254,8 +254,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i64_slt
-; SI: V_CMP_LT_I64
+; FUNC-LABEL: {{^}}i64_slt:
+; SI: v_cmp_lt_i64
 define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp slt i64 %a, %b
@@ -264,8 +264,8 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @i64_sle
-; SI: V_CMP_LE_I64
+; FUNC-LABEL: {{^}}i64_sle:
+; SI: v_cmp_le_i64
 define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp sle i64 %a, %b
diff --git a/test/CodeGen/R600/seto.ll b/test/CodeGen/R600/seto.ll
index e90e788..5fe6ff6 100644
--- a/test/CodeGen/R600/seto.ll
+++ b/test/CodeGen/R600/seto.ll
@@ -1,8 +1,8 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: @main
-;CHECK: V_CMP_O_F32_e64 s[0:1], {{[sv][0-9]+, [sv][0-9]+}}, 0, 0
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
 
+; CHECK-LABEL: {{^}}main:
+; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
+; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]]
 define void @main(float %p) {
 main_body:
   %c = fcmp oeq float %p, %p
diff --git a/test/CodeGen/R600/setuo.ll b/test/CodeGen/R600/setuo.ll
index 3b1db8b..a391177 100644
--- a/test/CodeGen/R600/setuo.ll
+++ b/test/CodeGen/R600/setuo.ll
@@ -1,8 +1,8 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: @main
-;CHECK: V_CMP_U_F32_e64 s[0:1], {{[sv][0-9]+, [sv][0-9]+}}, 0, 0
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
 
+; CHECK-LABEL: {{^}}main:
+; CHECK: v_cmp_u_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
+; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]]
 define void @main(float %p) {
 main_body:
   %c = fcmp une float %p, %p
diff --git a/test/CodeGen/R600/sext-eliminate.ll b/test/CodeGen/R600/sext-eliminate.ll
new file mode 100644
index 0000000..7dc6eb8
--- /dev/null
+++ b/test/CodeGen/R600/sext-eliminate.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_add:
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
+; EG: SUB_INT {{[* ]*}}[[RES]]
+; EG-NOT: BFE
+define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) {
+  %sext = sext i1 %a to i32
+  %res = add i32 %b, %sext
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_sub:
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
+; EG: ADD_INT {{[* ]*}}[[RES]]
+; EG-NOT: BFE
+define void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) {
+  %sext = sext i1 %a to i32
+  %res = sub i32 %b, %sext
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/sext-in-reg.ll b/test/CodeGen/R600/sext-in-reg.ll
index 1b02e4b..d364e6b 100644
--- a/test/CodeGen/R600/sext-in-reg.ll
+++ b/test/CodeGen/R600/sext-in-reg.ll
@@ -2,13 +2,14 @@
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
 
-; FUNC-LABEL: @sext_in_reg_i1_i32
-; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
-; SI: S_BFE_I32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
-; SI: V_MOV_B32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]]
-; SI: BUFFER_STORE_DWORD [[EXTRACT]],
+; FUNC-LABEL: {{^}}sext_in_reg_i1_i32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: s_bfe_i32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
+; SI: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]]
+; SI: buffer_store_dword [[EXTRACT]],
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1
@@ -20,11 +21,11 @@ define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) {
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i8_to_i32
-; SI: S_ADD_I32 [[VAL:s[0-9]+]],
-; SI: S_SEXT_I32_I8 [[EXTRACT:s[0-9]+]], [[VAL]]
-; SI: V_MOV_B32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
-; SI: BUFFER_STORE_DWORD [[VEXTRACT]],
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32:
+; SI: s_add_i32 [[VAL:s[0-9]+]],
+; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
+; SI: buffer_store_dword [[VEXTRACT]],
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: ADD_INT
@@ -38,11 +39,11 @@ define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounw
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i16_to_i32
-; SI: S_ADD_I32 [[VAL:s[0-9]+]],
-; SI: S_SEXT_I32_I16 [[EXTRACT:s[0-9]+]], [[VAL]]
-; SI: V_MOV_B32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
-; SI: BUFFER_STORE_DWORD [[VEXTRACT]],
+; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i32:
+; SI: s_add_i32 [[VAL:s[0-9]+]],
+; SI: s_sext_i32_i16 [[EXTRACT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
+; SI: buffer_store_dword [[VEXTRACT]],
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: ADD_INT
@@ -56,11 +57,11 @@ define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) noun
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i8_to_v1i32
-; SI: S_ADD_I32 [[VAL:s[0-9]+]],
-; SI: S_SEXT_I32_I8 [[EXTRACT:s[0-9]+]], [[VAL]]
-; SI: V_MOV_B32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
-; SI: BUFFER_STORE_DWORD [[VEXTRACT]],
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i32:
+; SI: s_add_i32 [[VAL:s[0-9]+]],
+; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
+; SI: buffer_store_dword [[VEXTRACT]],
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: ADD_INT
@@ -74,29 +75,31 @@ define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a,
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i1_to_i64
-; SI: S_ADD_I32 [[VAL:s[0-9]+]],
-; SI: S_BFE_I32 s{{[0-9]+}}, s{{[0-9]+}}, 0x10000
-; SI: S_MOV_B32 {{s[0-9]+}}, -1
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sext_in_reg_i1_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x10000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
-  %c = add i64 %a, %b
+  %c = shl i64 %a, %b
   %shl = shl i64 %c, 63
   %ashr = ashr i64 %shl, 63
   store i64 %ashr, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i8_to_i64
-; SI: S_ADD_I32 [[VAL:s[0-9]+]],
-; SI: S_SEXT_I32_I8 [[EXTRACT:s[0-9]+]], [[VAL]]
-; SI: S_MOV_B32 {{s[0-9]+}}, -1
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x80000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
-; EG: ADD_INT
-; EG-NEXT: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
+; EG: LSHL
+; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
 ; EG: ASHR [[RES_HI]]
 ; EG-NOT: BFE_INT
 ; EG: LSHR
@@ -104,23 +107,24 @@ define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
 ;; TODO Check address computation, using | with variables in {{}} does not work,
 ;; also the _LO/_HI order might be different
 define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
-  %c = add i64 %a, %b
+  %c = shl i64 %a, %b
   %shl = shl i64 %c, 56
   %ashr = ashr i64 %shl, 56
   store i64 %ashr, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i16_to_i64
-; SI: S_ADD_I32 [[VAL:s[0-9]+]],
-; SI: S_SEXT_I32_I16 [[EXTRACT:s[0-9]+]], [[VAL]]
-; SI: S_MOV_B32 {{s[0-9]+}}, -1
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x100000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
-; EG: ADD_INT
-; EG-NEXT: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
+; EG: LSHL
+; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
 ; EG: ASHR [[RES_HI]]
 ; EG-NOT: BFE_INT
 ; EG: LSHR
@@ -128,32 +132,32 @@ define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
 ;; TODO Check address computation, using | with variables in {{}} does not work,
 ;; also the _LO/_HI order might be different
 define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
-  %c = add i64 %a, %b
+  %c = shl i64 %a, %b
   %shl = shl i64 %c, 48
   %ashr = ashr i64 %shl, 48
   store i64 %ashr, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i32_to_i64
-; SI: S_LOAD_DWORD
-; SI: S_LOAD_DWORD
-; SI: S_ADD_I32 [[ADD:s[0-9]+]],
-; SI: S_ASHR_I32 s{{[0-9]+}}, [[ADD]], 31
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sext_in_reg_i32_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x200000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
 ; EG-NOT: BFE_INT
-; EG: ADD_INT {{\*?}} [[RES_LO]]
+
 ; EG: ASHR [[RES_HI]]
-; EG: ADD_INT
+
 ; EG: LSHR
 ; EG: LSHR
 ;; TODO Check address computation, using | with variables in {{}} does not work,
 ;; also the _LO/_HI order might be different
 define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
-  %c = add i64 %a, %b
+  %c = shl i64 %a, %b
   %shl = shl i64 %c, 32
   %ashr = ashr i64 %shl, 32
   store i64 %ashr, i64 addrspace(1)* %out, align 8
@@ -161,10 +165,10 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) noun
 }
 
 ; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments.
-; XFUNC-LABEL: @sext_in_reg_i8_to_v1i64
-; XSI: S_BFE_I32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288
-; XSI: S_ASHR_I32 {{v[0-9]+}}, [[EXTRACT]], 31
-; XSI: BUFFER_STORE_DWORD
+; XFUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i64:
+; XSI: s_bfe_i32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288
+; XSI: s_ashr_i32 {{v[0-9]+}}, [[EXTRACT]], 31
+; XSI: buffer_store_dword
 ; XEG: BFE_INT
 ; XEG: ASHR
 ; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) nounwind {
@@ -175,10 +179,93 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) noun
 ;   ret void
 ; }
 
-; FUNC-LABEL: @sext_in_reg_i1_in_i32_other_amount
-; SI-NOT: BFE
-; SI: S_LSHL_B32 [[REG:s[0-9]+]], {{s[0-9]+}}, 6
-; SI: S_ASHR_I32 {{s[0-9]+}}, [[REG]], 7
+; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1
+; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr i64 addrspace(1)* %out, i32 %tid
+  %a = load i64 addrspace(1)* %a.gep, align 8
+  %b = load i64 addrspace(1)* %b.gep, align 8
+
+  %c = shl i64 %a, %b
+  %shl = shl i64 %c, 63
+  %ashr = ashr i64 %shl, 63
+  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i8_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8
+; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr i64 addrspace(1)* %out, i32 %tid
+  %a = load i64 addrspace(1)* %a.gep, align 8
+  %b = load i64 addrspace(1)* %b.gep, align 8
+
+  %c = shl i64 %a, %b
+  %shl = shl i64 %c, 56
+  %ashr = ashr i64 %shl, 56
+  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i16_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16
+; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr i64 addrspace(1)* %out, i32 %tid
+  %a = load i64 addrspace(1)* %a.gep, align 8
+  %b = load i64 addrspace(1)* %b.gep, align 8
+
+  %c = shl i64 %a, %b
+  %shl = shl i64 %c, 48
+  %ashr = ashr i64 %shl, 48
+  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
+; SI: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[SHR]]{{\]}}
+define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr i64 addrspace(1)* %out, i32 %tid
+  %a = load i64 addrspace(1)* %a.gep, align 8
+  %b = load i64 addrspace(1)* %b.gep, align 8
+
+  %c = shl i64 %a, %b
+  %shl = shl i64 %c, 32
+  %ashr = ashr i64 %shl, 32
+  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_in_i32_other_amount:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_lshl_b32 [[REG:s[0-9]+]], {{s[0-9]+}}, 6
+; SI: s_ashr_i32 {{s[0-9]+}}, [[REG]], 7
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG-NOT: BFE
@@ -194,11 +281,12 @@ define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a,
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_v2i1_in_v2i32_other_amount
-; SI: S_LSHL_B32 [[REG0:s[0-9]+]], {{s[0-9]}}, 6
-; SI: S_ASHR_I32 {{s[0-9]+}}, [[REG0]], 7
-; SI: S_LSHL_B32 [[REG1:s[0-9]+]], {{s[0-9]}}, 6
-; SI: S_ASHR_I32 {{s[0-9]+}}, [[REG1]], 7
+; FUNC-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount:
+; SI-DAG: s_lshl_b32 [[REG0:s[0-9]+]], {{s[0-9]}}, 6
+; SI-DAG: s_ashr_i32 {{s[0-9]+}}, [[REG0]], 7
+; SI-DAG: s_lshl_b32 [[REG1:s[0-9]+]], {{s[0-9]}}, 6
+; SI-DAG: s_ashr_i32 {{s[0-9]+}}, [[REG1]], 7
+; SI: s_endpgm
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
 ; EG-NOT: BFE
@@ -217,10 +305,10 @@ define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out
 }
 
 
-; FUNC-LABEL: @sext_in_reg_v2i1_to_v2i32
-; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
-; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sext_in_reg_v2i1_to_v2i32:
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: buffer_store_dwordx2
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: BFE_INT [[RES]]
@@ -234,12 +322,12 @@ define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_v4i1_to_v4i32
-; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
-; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
-; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
-; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
-; SI: BUFFER_STORE_DWORDX4
+; FUNC-LABEL: {{^}}sext_in_reg_v4i1_to_v4i32:
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: buffer_store_dwordx4
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: BFE_INT [[RES]]
@@ -255,10 +343,10 @@ define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_v2i8_to_v2i32
-; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
-; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sext_in_reg_v2i8_to_v2i32:
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: buffer_store_dwordx2
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: BFE_INT [[RES]]
@@ -272,12 +360,12 @@ define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_v4i8_to_v4i32
-; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
-; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
-; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
-; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
-; SI: BUFFER_STORE_DWORDX4
+; FUNC-LABEL: {{^}}sext_in_reg_v4i8_to_v4i32:
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: buffer_store_dwordx4
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: BFE_INT [[RES]]
@@ -293,10 +381,10 @@ define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_v2i16_to_v2i32
-; SI: S_SEXT_I32_I16 {{s[0-9]+}}, {{s[0-9]+}}
-; SI: S_SEXT_I32_I16 {{s[0-9]+}}, {{s[0-9]+}}
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sext_in_reg_v2i16_to_v2i32:
+; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: buffer_store_dwordx2
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: BFE_INT [[RES]]
@@ -310,7 +398,7 @@ define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
   ret void
 }
 
-; FUNC-LABEL: @testcase
+; FUNC-LABEL: {{^}}testcase:
 define void @testcase(i8 addrspace(1)* %out, i8 %a) nounwind {
   %and_a_1 = and i8 %a, 1
   %cmp_eq = icmp eq i8 %and_a_1, 0
@@ -322,7 +410,7 @@ define void @testcase(i8 addrspace(1)* %out, i8 %a) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @testcase_3
+; FUNC-LABEL: {{^}}testcase_3:
 define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
   %and_a_1 = and i8 %a, 1
   %cmp_eq = icmp eq i8 %and_a_1, 0
@@ -334,11 +422,11 @@ define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @vgpr_sext_in_reg_v4i8_to_v4i32
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i8_to_v4i32:
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
 define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
   %loada = load <4 x i32> addrspace(1)* %a, align 16
   %loadb = load <4 x i32> addrspace(1)* %b, align 16
@@ -349,9 +437,9 @@ define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i
   ret void
 }
 
-; FUNC-LABEL: @vgpr_sext_in_reg_v4i16_to_v4i32
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32:
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
 define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
   %loada = load <4 x i32> addrspace(1)* %a, align 16
   %loadb = load <4 x i32> addrspace(1)* %b, align 16
@@ -365,11 +453,11 @@ define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
 ; FIXME: The BFE should really be eliminated. I think it should happen
 ; when computeKnownBitsForTargetNode is implemented for imax.
 
-; FUNC-LABEL: @sext_in_reg_to_illegal_type
-; SI: BUFFER_LOAD_SBYTE
-; SI: V_MAX_I32
-; SI: V_BFE_I32
-; SI: BUFFER_STORE_SHORT
+; FUNC-LABEL: {{^}}sext_in_reg_to_illegal_type:
+; SI: buffer_load_sbyte
+; SI: v_max_i32
+; SI: v_bfe_i32
+; SI: buffer_store_short
 define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
   %tmp5 = load i8 addrspace(1)* %src, align 1
   %tmp2 = sext i8 %tmp5 to i32
@@ -382,9 +470,9 @@ define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 ad
 
 declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
 
-; FUNC-LABEL: @bfe_0_width
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_0_width:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
   %load = load i32 addrspace(1)* %ptr, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone
@@ -392,10 +480,10 @@ define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwin
   ret void
 }
 
-; FUNC-LABEL: @bfe_8_bfe_8
-; SI: V_BFE_I32
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_8_bfe_8:
+; SI: v_bfe_i32
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
   %load = load i32 addrspace(1)* %ptr, align 4
   %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
@@ -404,9 +492,9 @@ define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwin
   ret void
 }
 
-; FUNC-LABEL: @bfe_8_bfe_16
-; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_8_bfe_16:
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; SI: s_endpgm
 define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
   %load = load i32 addrspace(1)* %ptr, align 4
   %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
@@ -416,10 +504,10 @@ define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwi
 }
 
 ; This really should be folded into 1
-; FUNC-LABEL: @bfe_16_bfe_8
-; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_16_bfe_8:
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
   %load = load i32 addrspace(1)* %ptr, align 4
   %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone
@@ -429,10 +517,10 @@ define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwi
 }
 
 ; Make sure there isn't a redundant BFE
-; FUNC-LABEL: @sext_in_reg_i8_to_i32_bfe
-; SI: S_SEXT_I32_I8 s{{[0-9]+}}, s{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe:
+; SI: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %c = add i32 %a, %b ; add to prevent folding into extload
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone
@@ -442,7 +530,7 @@ define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) n
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i8_to_i32_bfe_wrong
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong:
 define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %c = add i32 %a, %b ; add to prevent folding into extload
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone
@@ -452,10 +540,10 @@ define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32
   ret void
 }
 
-; FUNC-LABEL: @sextload_i8_to_i32_bfe
-; SI: BUFFER_LOAD_SBYTE
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe:
+; SI: buffer_load_sbyte
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind {
   %load = load i8 addrspace(1)* %ptr, align 1
   %sext = sext i8 %load to i32
@@ -466,9 +554,10 @@ define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %pt
   ret void
 }
 
-; FUNC-LABEL: @sextload_i8_to_i32_bfe_0:
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:
+; SI: .text
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind {
   %load = load i8 addrspace(1)* %ptr, align 1
   %sext = sext i8 %load to i32
@@ -479,11 +568,11 @@ define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i1_bfe_offset_0:
-; SI-NOT: SHR
-; SI-NOT: SHL
-; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0:
+; SI-NOT: shr
+; SI-NOT: shl
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
+; SI: s_endpgm
 define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -493,12 +582,12 @@ define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i1_bfe_offset_1
-; SI: BUFFER_LOAD_DWORD
-; SI-NOT: SHL
-; SI-NOT: SHR
-; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1:
+; SI: buffer_load_dword
+; SI-NOT: shl
+; SI-NOT: shr
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
+; SI: s_endpgm
 define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 30
@@ -508,12 +597,12 @@ define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i2_bfe_offset_1:
-; SI: BUFFER_LOAD_DWORD
-; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 30, v{{[0-9]+}}
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 30, v{{[0-9]+}}
-; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1:
+; SI: buffer_load_dword
+; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 30, v{{[0-9]+}}
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 30, v{{[0-9]+}}
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
+; SI: s_endpgm
 define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 30
diff --git a/test/CodeGen/R600/sgpr-control-flow.ll b/test/CodeGen/R600/sgpr-control-flow.ll
index 06ad24d..d8b8dff 100644
--- a/test/CodeGen/R600/sgpr-control-flow.ll
+++ b/test/CodeGen/R600/sgpr-control-flow.ll
@@ -4,9 +4,14 @@
 ; Most SALU instructions ignore control flow, so we need to make sure
 ; they don't overwrite values from other blocks.
 
-; SI-NOT: S_ADD
+; If the branch decision is made based on a value in an SGPR then all
+; threads will execute the same code paths, so we don't need to worry
+; about instructions in different blocks overwriting each other.
+; SI-LABEL: {{^}}sgpr_if_else_salu_br:
+; SI: s_add
+; SI: s_add
 
-define void @sgpr_if_else(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 entry:
   %0 = icmp eq i32 %a, 0
   br i1 %0, label %if, label %else
@@ -25,3 +30,35 @@ endif:
   store i32 %4, i32 addrspace(1)* %out
   ret void
 }
+
+; The two S_ADD instructions should write to different registers, since
+; different threads will take different control flow paths.
+
+; SI-LABEL: {{^}}sgpr_if_else_valu_br:
+; SI: s_add_i32 [[SGPR:s[0-9]+]]
+; SI-NOT: s_add_i32 [[SGPR]]
+
+define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+entry:
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %tid_f = uitofp i32 %tid to float
+  %tmp1 = fcmp ueq float %tid_f, 0.0
+  br i1 %tmp1, label %if, label %else
+
+if:
+  %tmp2 = add i32 %b, %c
+  br label %endif
+
+else:
+  %tmp3 = add i32 %d, %e
+  br label %endif
+
+endif:
+  %tmp4 = phi i32 [%tmp2, %if], [%tmp3, %else]
+  store i32 %tmp4, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll b/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
index 9d8a623..aa97fbf 100644
--- a/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
+++ b/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
@@ -3,8 +3,8 @@
 ; Copy VGPR -> SGPR used twice as an instruction operand, which is then
 ; used in an REG_SEQUENCE that also needs to be handled.
 
-; SI-LABEL: @test_dup_operands:
-; SI: V_ADD_I32_e32
+; SI-LABEL: {{^}}test_dup_operands:
+; SI: v_add_i32_e32
 define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) {
   %a = load <2 x i32> addrspace(1)* %in
   %lo = extractelement <2 x i32> %a, i32 0
diff --git a/test/CodeGen/R600/sgpr-copy.ll b/test/CodeGen/R600/sgpr-copy.ll
index c7d5bf9..8daf753 100644
--- a/test/CodeGen/R600/sgpr-copy.ll
+++ b/test/CodeGen/R600/sgpr-copy.ll
@@ -2,9 +2,9 @@
 
 ; This test checks that no VGPR to SGPR copies are created by the register
 ; allocator.
-; CHECK-LABEL: @phi1
-; CHECK: S_BUFFER_LOAD_DWORD [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0
-; CHECK: V_MOV_B32_e32 v{{[0-9]}}, [[DST]]
+; CHECK-LABEL: {{^}}phi1:
+; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0
+; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]]
 
 define void @phi1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
@@ -29,7 +29,7 @@ ENDIF:                                            ; preds = %main_body, %ELSE
 }
 
 ; Make sure this program doesn't crash
-; CHECK-LABEL: @phi2
+; CHECK-LABEL: {{^}}phi2:
 define void @phi2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
@@ -149,7 +149,7 @@ ENDIF24:                                          ; preds = %ENDIF, %IF25
 }
 
 ; We just want ot make sure the program doesn't crash
-; CHECK-LABEL: @loop
+; CHECK-LABEL: {{^}}loop:
 
 define void @loop(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
@@ -227,11 +227,11 @@ declare i32 @llvm.SI.packf16(float, float) #1
 ; registers were being identified as an SGPR regclass which was causing
 ; an assertion failure.
 
-; CHECK-LABEL: @sample_v3
-; CHECK: IMAGE_SAMPLE
-; CHECK: IMAGE_SAMPLE
-; CHECK: EXP
-; CHECK: S_ENDPGM
+; CHECK-LABEL: {{^}}sample_v3:
+; CHECK: image_sample
+; CHECK: image_sample
+; CHECK: exp
+; CHECK: s_endpgm
 define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 
 entry:
@@ -269,10 +269,10 @@ endif:
 
 !2 = metadata !{metadata !"const", null, i32 1}
 
-; CHECK-LABEL: @copy1
-; CHECK: BUFFER_LOAD_DWORD
-; CHECK: V_ADD
-; CHECK: S_ENDPGM
+; CHECK-LABEL: {{^}}copy1:
+; CHECK: buffer_load_dword
+; CHECK: v_add
+; CHECK: s_endpgm
 define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) {
 entry:
   %0 = load float addrspace(1)* %in0
@@ -296,8 +296,8 @@ endif:
 }
 
 ; This test is just checking that we don't crash / assertion fail.
-; CHECK-LABEL: @copy2
-; CHECK: S_ENDPGM
+; CHECK-LABEL: {{^}}copy2:
+; CHECK: s_endpgm
 
 define void @copy2([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 entry:
@@ -325,3 +325,54 @@ ENDIF69:
 
 attributes #0 = { "ShaderType"="0" }
 
+; This test checks that image_sample resource descriptors aren't loaded into
+; vgprs.  The verifier will fail if this happens.
+; CHECK-LABEL:{{^}}sample_rsrc:
+; CHECK: image_sample
+; CHECK: image_sample
+; CHECK: s_endpgm
+define void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
+bb:
+  %tmp = getelementptr [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0
+  %tmp22 = load <16 x i8> addrspace(2)* %tmp, !tbaa !0
+  %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16)
+  %tmp25 = getelementptr [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0
+  %tmp26 = load <8 x i32> addrspace(2)* %tmp25, !tbaa !0
+  %tmp27 = getelementptr [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0
+  %tmp28 = load <4 x i32> addrspace(2)* %tmp27, !tbaa !0
+  %tmp29 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg7)
+  %tmp30 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg7)
+  %tmp31 = bitcast float %tmp23 to i32
+  %tmp36 = icmp ne i32 %tmp31, 0
+  br i1 %tmp36, label %bb38, label %bb80
+
+bb38:                                             ; preds = %bb
+  %tmp52 = bitcast float %tmp29 to i32
+  %tmp53 = bitcast float %tmp30 to i32
+  %tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0
+  %tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1
+  %tmp56 = bitcast <8 x i32> %tmp26 to <32 x i8>
+  %tmp57 = bitcast <4 x i32> %tmp28 to <16 x i8>
+  %tmp58 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp55, <32 x i8> %tmp56, <16 x i8> %tmp57, i32 2)
+  br label %bb71
+
+bb80:                                             ; preds = %bb
+  %tmp81 = bitcast float %tmp29 to i32
+  %tmp82 = bitcast float %tmp30 to i32
+  %tmp82.2 = add i32 %tmp82, 1
+  %tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0
+  %tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1
+  %tmp85 = bitcast <8 x i32> %tmp26 to <32 x i8>
+  %tmp86 = bitcast <4 x i32> %tmp28 to <16 x i8>
+  %tmp87 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp84, <32 x i8> %tmp85, <16 x i8> %tmp86, i32 2)
+  br label %bb71
+
+bb71:                                             ; preds = %bb80, %bb38
+  %tmp72 = phi <4 x float> [ %tmp58, %bb38 ], [ %tmp87, %bb80 ]
+  %tmp88 = extractelement <4 x float> %tmp72, i32 0
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp88, float %tmp88, float %tmp88, float %tmp88)
+  ret void
+}
+
+attributes #0 = { "ShaderType"="0" "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/shared-op-cycle.ll b/test/CodeGen/R600/shared-op-cycle.ll
index 0484fc9..f52a9ba 100644
--- a/test/CodeGen/R600/shared-op-cycle.ll
+++ b/test/CodeGen/R600/shared-op-cycle.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK: @main
+; CHECK: {{^}}main:
 ; CHECK: MULADD_IEEE *
 ; CHECK-NOT: MULADD_IEEE *
 
diff --git a/test/CodeGen/R600/shl.ll b/test/CodeGen/R600/shl.ll
index 43fab2a..71c9fc4 100644
--- a/test/CodeGen/R600/shl.ll
+++ b/test/CodeGen/R600/shl.ll
@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-;EG-CHECK: @shl_v2i32
+;EG-CHECK: {{^}}shl_v2i32:
 ;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @shl_v2i32
-;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: {{^}}shl_v2i32:
+;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -18,17 +18,17 @@ define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in
   ret void
 }
 
-;EG-CHECK: @shl_v4i32
+;EG-CHECK: {{^}}shl_v4i32:
 ;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @shl_v4i32
-;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: {{^}}shl_v4i32:
+;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -39,7 +39,7 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
   ret void
 }
 
-;EG-CHECK: @shl_i64
+;EG-CHECK: {{^}}shl_i64:
 ;EG-CHECK: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
 ;EG-CHECK: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
 ;EG-CHECK: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
@@ -51,8 +51,8 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
 ;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
 ;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
 
-;SI-CHECK: @shl_i64
-;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: {{^}}shl_i64:
+;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
 define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
@@ -63,7 +63,7 @@ define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   ret void
 }
 
-;EG-CHECK: @shl_v2i64
+;EG-CHECK: {{^}}shl_v2i64:
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: LSHR {{\*? *}}[[COMPSHA]]
@@ -85,9 +85,9 @@ define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ;EG-CHECK-DAG: CNDE_INT
 ;EG-CHECK-DAG: CNDE_INT
 
-;SI-CHECK: @shl_v2i64
-;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: {{^}}shl_v2i64:
+;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
 define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
@@ -98,7 +98,7 @@ define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in
   ret void
 }
 
-;EG-CHECK: @shl_v4i64
+;EG-CHECK: {{^}}shl_v4i64:
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
@@ -140,11 +140,11 @@ define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in
 ;EG-CHECK-DAG: CNDE_INT
 ;EG-CHECK-DAG: CNDE_INT
 
-;SI-CHECK: @shl_v4i64
-;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: {{^}}shl_v4i64:
+;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
 define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1
diff --git a/test/CodeGen/R600/shl_add_constant.ll b/test/CodeGen/R600/shl_add_constant.ll
new file mode 100644
index 0000000..801f77d
--- /dev/null
+++ b/test/CodeGen/R600/shl_add_constant.ll
@@ -0,0 +1,90 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Test with inline immediate
+
+; FUNC-LABEL: {{^}}shl_2_add_9_i32:
+; SI: v_lshlrev_b32_e32  [[REG:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 36, [[REG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %ptr = getelementptr i32 addrspace(1)* %in, i32 %tid.x
+  %val = load i32 addrspace(1)* %ptr, align 4
+  %add = add i32 %val, 9
+  %result = shl i32 %add, 2
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}shl_2_add_9_i32_2_add_uses:
+; SI-DAG: v_add_i32_e32 [[ADDREG:v[0-9]+]], 9, {{v[0-9]+}}
+; SI-DAG: v_lshlrev_b32_e32 [[SHLREG:v[0-9]+]], 2, {{v[0-9]+}}
+; SI-DAG: buffer_store_dword [[ADDREG]]
+; SI-DAG: buffer_store_dword [[SHLREG]]
+; SI: s_endpgm
+define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %ptr = getelementptr i32 addrspace(1)* %in, i32 %tid.x
+  %val = load i32 addrspace(1)* %ptr, align 4
+  %add = add i32 %val, 9
+  %result = shl i32 %add, 2
+  store i32 %result, i32 addrspace(1)* %out0, align 4
+  store i32 %add, i32 addrspace(1)* %out1, align 4
+  ret void
+}
+
+; Test with add literal constant
+
+; FUNC-LABEL: {{^}}shl_2_add_999_i32:
+; SI: v_lshlrev_b32_e32  [[REG:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 0xf9c, [[REG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %ptr = getelementptr i32 addrspace(1)* %in, i32 %tid.x
+  %val = load i32 addrspace(1)* %ptr, align 4
+  %shl = add i32 %val, 999
+  %result = shl i32 %shl, 2
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_add_shl_add_constant:
+; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3
+; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]]
+; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
+; SI: buffer_store_dword [[VRESULT]]
+define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+  %add.0 = add i32 %x, 123
+  %shl = shl i32 %add.0, 3
+  %add.1 = add i32 %shl, %y
+   store i32 %add.1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_add_shl_add_constant_inv:
+; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3
+; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]]
+; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
+; SI: buffer_store_dword [[VRESULT]]
+
+define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+  %add.0 = add i32 %x, 123
+  %shl = shl i32 %add.0, 3
+  %add.1 = add i32 %y, %shl
+  store i32 %add.1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/shl_add_ptr.ll b/test/CodeGen/R600/shl_add_ptr.ll
new file mode 100644
index 0000000..047cf25
--- /dev/null
+++ b/test/CodeGen/R600/shl_add_ptr.ll
@@ -0,0 +1,282 @@
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+
+; Test that doing a shift of a pointer with a constant add will be
+; folded into the constant offset addressing mode even if the add has
+; multiple uses. This is relevant to accessing 2 separate, adjacent
+; LDS globals.
+
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+@lds0 = addrspace(3) global [512 x float] undef, align 4
+@lds1 = addrspace(3) global [512 x float] undef, align 4
+
+
+; Make sure the (add tid, 2) << 2 gets folded into the ds's offset as (tid << 2) + 8
+
+; SI-LABEL: {{^}}load_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8 [M0]
+; SI: s_endpgm
+define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  store float %val0, float addrspace(1)* %out
+  ret void
+}
+
+; Make sure once the first use is folded into the addressing mode, the
+; remaining add use goes through the normal shl + add constant fold.
+
+; SI-LABEL: {{^}}load_shl_base_lds_1:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8 [M0]
+; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], 8, v{{[0-9]+}}
+; SI-DAG: buffer_store_dword [[RESULT]]
+; SI-DAG: buffer_store_dword [[ADDUSE]]
+; SI: s_endpgm
+define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %shl_add_use = shl i32 %idx.0, 2
+  store i32 %shl_add_use, i32 addrspace(1)* %add_use, align 4
+  store float %val0, float addrspace(1)* %out
+  ret void
+}
+
+@maxlds = addrspace(3) global [65536 x i8] undef, align 4
+
+; SI-LABEL: {{^}}load_shl_base_lds_max_offset
+; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535
+; SI: s_endpgm
+define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 65535
+  %arrayidx0 = getelementptr inbounds [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0
+  %val0 = load i8 addrspace(3)* %arrayidx0
+  store i32 %idx.0, i32 addrspace(1)* %add_use
+  store i8 %val0, i8 addrspace(1)* %out
+  ret void
+}
+
+; The two globals are placed adjacent in memory, so the same base
+; pointer can be used with an offset into the second one.
+
+; SI-LABEL: {{^}}load_shl_base_lds_2:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 [M0]
+; SI: s_endpgm
+define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 64
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  store float %sum, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}store_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8 [M0]
+; SI: s_endpgm
+define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  store float 1.0, float addrspace(3)* %arrayidx0, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+
+; --------------------------------------------------------------------------------
+; Atomics.
+
+@lds2 = addrspace(3) global [512 x i32] undef, align 4
+
+; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+;   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+;   %idx.0 = add nsw i32 %tid.x, 2
+;   %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+;   %val = load atomic i32 addrspace(3)* %arrayidx0 seq_cst, align 4
+;   store i32 %val, i32 addrspace(1)* %out, align 4
+;   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+;   ret void
+; }
+
+
+; SI-LABEL: {{^}}atomic_cmpxchg_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %pair = cmpxchg i32 addrspace(3)* %arrayidx0, i32 7, i32 %swap seq_cst monotonic
+  %result = extractvalue { i32, i1 } %pair, 0
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_swap_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_add_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw add i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_sub_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw sub i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_and_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw and i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_or_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw or i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_xor_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw xor i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+;   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+;   %idx.0 = add nsw i32 %tid.x, 2
+;   %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+;   %val = atomicrmw nand i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+;   store i32 %val, i32 addrspace(1)* %out, align 4
+;   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+;   ret void
+; }
+
+; SI-LABEL: {{^}}atomic_min_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw min i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_max_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw max i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_umin_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw umin i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_umax_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw umax i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/si-annotate-cf-assertion.ll b/test/CodeGen/R600/si-annotate-cf-assertion.ll
index daa4667..6d60b0a 100644
--- a/test/CodeGen/R600/si-annotate-cf-assertion.ll
+++ b/test/CodeGen/R600/si-annotate-cf-assertion.ll
@@ -4,7 +4,7 @@
 
 
 define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
-; CHECK-LABEL: @test:
+; CHECK-LABEL: {{^}}test:
 
 entry:
   switch i32 %x, label %sw.default [
diff --git a/test/CodeGen/R600/si-lod-bias.ll b/test/CodeGen/R600/si-lod-bias.ll
index 8d7a79c..60277d6 100644
--- a/test/CodeGen/R600/si-lod-bias.ll
+++ b/test/CodeGen/R600/si-lod-bias.ll
@@ -3,8 +3,8 @@
 ; This shader has the potential to generated illegal VGPR to SGPR copies if
 ; the wrong register class is used for the REG_SEQUENCE instructions.
 
-; CHECK: @main
-; CHECK: IMAGE_SAMPLE_B v{{\[[0-9]:[0-9]\]}}, 15, 0, 0, 0, 0, 0, 0, 0, v{{\[[0-9]:[0-9]\]}}
+; CHECK: {{^}}main:
+; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, 15, 0, 0, 0, 0, 0, 0, 0, v{{\[[0-9]:[0-9]\]}}
 
 define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
diff --git a/test/CodeGen/R600/si-sgpr-spill.ll b/test/CodeGen/R600/si-sgpr-spill.ll
index 53a0965..439d8e2 100644
--- a/test/CodeGen/R600/si-sgpr-spill.ll
+++ b/test/CodeGen/R600/si-sgpr-spill.ll
@@ -3,10 +3,10 @@
 ; These tests check that the compiler won't crash when it needs to spill
 ; SGPRs.
 
-; CHECK-LABEL: @main
+; CHECK-LABEL: {{^}}main:
 ; Writing to M0 from an SMRD instruction will hang the GPU.
-; CHECK-NOT: S_BUFFER_LOAD_DWORD m0
-; CHECK: S_ENDPGM
+; CHECK-NOT: s_buffer_load_dword m0
+; CHECK: s_endpgm
 @ddxy_lds = external addrspace(3) global [64 x i32]
 
 define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
@@ -688,8 +688,8 @@ attributes #4 = { nounwind readonly }
 
 !0 = metadata !{metadata !"const", null, i32 1}
 
-; CHECK-LABEL: @main1
-; CHECK: S_ENDPGM
+; CHECK-LABEL: {{^}}main1:
+; CHECK: s_endpgm
 define void @main1([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %21 = getelementptr [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
diff --git a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
new file mode 100644
index 0000000..2c146eb
--- /dev/null
+++ b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
@@ -0,0 +1,238 @@
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
+
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.AMDGPU.barrier.local() #2
+
+
+@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4
+@stored_constant_ptr = addrspace(3) global i32 addrspace(2)* undef, align 8
+@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8
+
+; FUNC-LABEL: @reorder_local_load_global_store_local_load
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
+; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; CI: buffer_store_dword
+define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+  %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+
+  %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 2
+
+  %tmp1 = load i32 addrspace(3)* %ptr1, align 4
+  store i32 99, i32 addrspace(1)* %gptr, align 4
+  %tmp2 = load i32 addrspace(3)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @no_reorder_local_load_volatile_global_store_local_load
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
+; CI: buffer_store_dword
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+  %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+
+  %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 2
+
+  %tmp1 = load i32 addrspace(3)* %ptr1, align 4
+  store volatile i32 99, i32 addrspace(1)* %gptr, align 4
+  %tmp2 = load i32 addrspace(3)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
+; CI: buffer_store_dword
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+  %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+
+  %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 2
+
+  %tmp1 = load i32 addrspace(3)* %ptr1, align 4
+  store i32 99, i32 addrspace(1)* %gptr, align 4
+  call void @llvm.AMDGPU.barrier.local() #2
+  %tmp2 = load i32 addrspace(3)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; Technically we could reorder these, but just comparing the
+; instruction type of the load is insufficient.
+
+; FUNC-LABEL: @no_reorder_constant_load_global_store_constant_load
+; CI: buffer_load_dword
+; CI: buffer_store_dword
+; CI: buffer_load_dword
+; CI: buffer_store_dword
+define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+  %ptr0 = load i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
+
+  %ptr1 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 2
+
+  %tmp1 = load i32 addrspace(2)* %ptr1, align 4
+  store i32 99, i32 addrspace(1)* %gptr, align 4
+  %tmp2 = load i32 addrspace(2)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; XXX: Should be able to reorder this, but the laods count as ordered
+
+; FUNC-LABEL: @reorder_constant_load_local_store_constant_load
+; CI: buffer_load_dword
+; CI: ds_write_b32
+; CI: buffer_load_dword
+; CI: buffer_store_dword
+define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
+  %ptr0 = load i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
+
+  %ptr1 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 2
+
+  %tmp1 = load i32 addrspace(2)* %ptr1, align 4
+  store i32 99, i32 addrspace(3)* %lptr, align 4
+  %tmp2 = load i32 addrspace(2)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @reorder_smrd_load_local_store_smrd_load
+; CI: s_load_dword
+; CI: s_load_dword
+; CI: s_load_dword
+; CI: ds_write_b32
+; CI: buffer_store_dword
+define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 2
+
+  %tmp1 = load i32 addrspace(2)* %ptr1, align 4
+  store i32 99, i32 addrspace(3)* %lptr, align 4
+  %tmp2 = load i32 addrspace(2)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @reorder_global_load_local_store_global_load
+; CI: buffer_load_dword
+; CI: buffer_load_dword
+; CI: ds_write_b32
+; CI: buffer_store_dword
+define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32 addrspace(1)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32 addrspace(1)* %ptr0, i64 2
+
+  %tmp1 = load i32 addrspace(1)* %ptr1, align 4
+  store i32 99, i32 addrspace(3)* %lptr, align 4
+  %tmp2 = load i32 addrspace(1)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @reorder_local_offsets
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
+; CI: buffer_store_dword
+; CI: s_endpgm
+define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 3
+  %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 100
+  %ptr3 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 101
+
+  store i32 123, i32 addrspace(3)* %ptr1, align 4
+  %tmp1 = load i32 addrspace(3)* %ptr2, align 4
+  %tmp2 = load i32 addrspace(3)* %ptr3, align 4
+  store i32 123, i32 addrspace(3)* %ptr2, align 4
+  %tmp3 = load i32 addrspace(3)* %ptr1, align 4
+  store i32 789, i32 addrspace(3)* %ptr3, align 4
+
+  %add.0 = add nsw i32 %tmp2, %tmp1
+  %add.1 = add nsw i32 %add.0, %tmp3
+  store i32 %add.1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @reorder_global_offsets
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0xc
+; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0x190
+; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0x194
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0x190
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0x194
+; CI: buffer_store_dword
+; CI: s_endpgm
+define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32 addrspace(1)* %ptr0, i32 3
+  %ptr2 = getelementptr inbounds i32 addrspace(1)* %ptr0, i32 100
+  %ptr3 = getelementptr inbounds i32 addrspace(1)* %ptr0, i32 101
+
+  store i32 123, i32 addrspace(1)* %ptr1, align 4
+  %tmp1 = load i32 addrspace(1)* %ptr2, align 4
+  %tmp2 = load i32 addrspace(1)* %ptr3, align 4
+  store i32 123, i32 addrspace(1)* %ptr2, align 4
+  %tmp3 = load i32 addrspace(1)* %ptr1, align 4
+  store i32 789, i32 addrspace(1)* %ptr3, align 4
+
+  %add.0 = add nsw i32 %tmp2, %tmp1
+  %add.1 = add nsw i32 %add.0, %tmp3
+  store i32 %add.1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; XFUNC-LABEL: @reorder_local_load_tbuffer_store_local_load
+; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4
+; XCI: TBUFFER_STORE_FORMAT
+; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8
+; define void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #1 {
+;   %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+
+;   %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 1
+;   %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 2
+
+;   %tmp1 = load i32 addrspace(3)* %ptr1, align 4
+
+;   %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+;   call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+;         i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
+;         i32 1, i32 0)
+
+;   %tmp2 = load i32 addrspace(3)* %ptr2, align 4
+
+;   %add = add nsw i32 %tmp1, %tmp2
+
+;   store i32 %add, i32 addrspace(1)* %out, align 4
+;   ret void
+; }
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { nounwind noduplicate }
diff --git a/test/CodeGen/R600/si-vector-hang.ll b/test/CodeGen/R600/si-vector-hang.ll
index 093234f..6f91c71 100644
--- a/test/CodeGen/R600/si-vector-hang.ll
+++ b/test/CodeGen/R600/si-vector-hang.ll
@@ -1,14 +1,14 @@
 ; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-; CHECK: @test_8_min_char
-; CHECK: BUFFER_STORE_BYTE
-; CHECK: BUFFER_STORE_BYTE
-; CHECK: BUFFER_STORE_BYTE
-; CHECK: BUFFER_STORE_BYTE
-; CHECK: BUFFER_STORE_BYTE
-; CHECK: BUFFER_STORE_BYTE
-; CHECK: BUFFER_STORE_BYTE
-; CHECK: BUFFER_STORE_BYTE
+; CHECK: {{^}}test_8_min_char:
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
 ; ModuleID = 'radeon'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
 target triple = "r600--"
diff --git a/test/CodeGen/R600/sign_extend.ll b/test/CodeGen/R600/sign_extend.ll
index e3bee50..94f4c46 100644
--- a/test/CodeGen/R600/sign_extend.ll
+++ b/test/CodeGen/R600/sign_extend.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @s_sext_i1_to_i32:
-; SI: V_CNDMASK_B32_e64
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}s_sext_i1_to_i32:
+; SI: v_cndmask_b32_e64
+; SI: s_endpgm
 define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
   %sext = sext i1 %cmp to i32
@@ -10,10 +10,10 @@ define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   ret void
 }
 
-; SI-LABEL: @test:
-; SI: V_ASHR
-; SI: S_ENDPG
-define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
+; SI-LABEL: {{^}}test_s_sext_i32_to_i64:
+; SI: s_ashr_i32
+; SI: s_endpg
+define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
 entry:
   %mul = mul i32 %a, %b
   %add = add i32 %mul, %c
@@ -22,10 +22,10 @@ entry:
   ret void
 }
 
-; SI-LABEL: @s_sext_i1_to_i64:
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}s_sext_i1_to_i64:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: s_endpgm
 define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
   %sext = sext i1 %cmp to i64
@@ -33,18 +33,18 @@ define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   ret void
 }
 
-; SI-LABEL: @s_sext_i32_to_i64:
-; SI: S_ASHR_I32
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}s_sext_i32_to_i64:
+; SI: s_ashr_i32
+; SI: s_endpgm
 define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
   %sext = sext i32 %a to i64
   store i64 %sext, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @v_sext_i32_to_i64:
-; SI: V_ASHR
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}v_sext_i32_to_i64:
+; SI: v_ashr
+; SI: s_endpgm
 define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32 addrspace(1)* %in, align 4
   %sext = sext i32 %val to i64
@@ -52,8 +52,8 @@ define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) no
   ret void
 }
 
-; SI-LABEL: @s_sext_i16_to_i64:
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}s_sext_i16_to_i64:
+; SI: s_endpgm
 define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
   %sext = sext i16 %a to i64
   store i64 %sext, i64 addrspace(1)* %out, align 8
diff --git a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll
index dee4326..8d9ee42 100644
--- a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll
+++ b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll
@@ -1,6 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
-
 ; XFAIL: *
+; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s
 
 ; 64-bit select was originally lowered with a build_pair, and this
 ; could be simplified to 1 cndmask instead of 2, but that broken when
@@ -15,10 +14,10 @@ define void @trunc_select_i64(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) {
 }
 
 ; FIXME: Fix truncating store for local memory
-; SI-LABEL: @trunc_load_alloca_i64:
-; SI: DS_READ_B32
-; SI-NOT: DS_READ_B64
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}trunc_load_alloca_i64:
+; SI: v_movrels_b32
+; SI-NOT: v_movrels_b32
+; SI: s_endpgm
 define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) {
   %idx = add i32 %a, %b
   %alloca = alloca i64, i32 4
diff --git a/test/CodeGen/R600/sint_to_fp.f64.ll b/test/CodeGen/R600/sint_to_fp.f64.ll
new file mode 100644
index 0000000..6e4f87c
--- /dev/null
+++ b/test/CodeGen/R600/sint_to_fp.f64.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; SI-LABEL: {{^}}sint_to_fp_i32_to_f64
+; SI: v_cvt_f64_i32_e32
+define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
+  %result = sitofp i32 %in to double
+  store double %result, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}sint_to_fp_i1_f64:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
+; we should be able to fold the SGPRs into the V_CNDMASK instructions.
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
+  %cmp = icmp eq i32 %in, 0
+  %fp = sitofp i1 %cmp to double
+  store double %fp, double addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}sint_to_fp_i1_f64_load:
+; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, -1
+; SI-NEXT: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) {
+  %fp = sitofp i1 %in to double
+  store double %fp, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: @s_sint_to_fp_i64_to_f64
+define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
+  %result = sitofp i64 %in to double
+  store double %result, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: @v_sint_to_fp_i64_to_f64
+; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; SI-DAG: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]]
+; SI-DAG: v_cvt_f64_i32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]]
+; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr i64 addrspace(1)* %in, i32 %tid
+  %val = load i64 addrspace(1)* %gep, align 8
+  %result = sitofp i64 %val to double
+  store double %result, double addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/sint_to_fp.ll b/test/CodeGen/R600/sint_to_fp.ll
index b27dfda..7b6ce43 100644
--- a/test/CodeGen/R600/sint_to_fp.ll
+++ b/test/CodeGen/R600/sint_to_fp.ll
@@ -1,28 +1,38 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
-
-; R600-CHECK: @sint_to_fp_v2i32
-; R600-CHECK-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
-; R600-CHECK-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
-; SI-CHECK: @sint_to_fp_v2i32
-; SI-CHECK: V_CVT_F32_I32_e32
-; SI-CHECK: V_CVT_F32_I32_e32
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}s_sint_to_fp_i32_to_f32:
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{s[0-9]+$}}
+define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) {
+  %result = sitofp i32 %in to float
+  store float %result, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sint_to_fp_v2i32:
+; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
+
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
 define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
   %result = sitofp <2 x i32> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK: @sint_to_fp_v4i32
-; R600-CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI-CHECK: @sint_to_fp_v4i32
-; SI-CHECK: V_CVT_F32_I32_e32
-; SI-CHECK: V_CVT_F32_I32_e32
-; SI-CHECK: V_CVT_F32_I32_e32
-; SI-CHECK: V_CVT_F32_I32_e32
+; FUNC-LABEL: {{^}}sint_to_fp_v4i32:
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
 define void @sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %value = load <4 x i32> addrspace(1) * %in
   %result = sitofp <4 x i32> %value to <4 x float>
@@ -30,11 +40,11 @@ define void @sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspac
   ret void
 }
 
-; FUNC-LABEL: @sint_to_fp_i1_f32:
-; SI: V_CMP_EQ_I32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; SI-NEXT: V_CNDMASK_B32_e64 [[RESULT:v[0-9]+]], 0, -1.000000e+00, [[CMP]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}sint_to_fp_i1_f32:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
   %fp = uitofp i1 %cmp to float
@@ -42,10 +52,10 @@ define void @sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) {
   ret void
 }
 
-; FUNC-LABEL: @sint_to_fp_i1_f32_load:
-; SI: V_CNDMASK_B32_e64 [[RESULT:v[0-9]+]], 0, -1.000000e+00
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}sint_to_fp_i1_f32_load:
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) {
   %fp = sitofp i1 %in to float
   store float %fp, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/sint_to_fp64.ll b/test/CodeGen/R600/sint_to_fp64.ll
deleted file mode 100644
index 12b8cf5..0000000
--- a/test/CodeGen/R600/sint_to_fp64.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-; SI: @sint_to_fp64
-; SI: V_CVT_F64_I32_e32
-define void @sint_to_fp64(double addrspace(1)* %out, i32 %in) {
-  %result = sitofp i32 %in to double
-  store double %result, double addrspace(1)* %out
-  ret void
-}
-
-; SI-LABEL: @sint_to_fp_i1_f64:
-; SI: V_CMP_EQ_I32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
-; we should be able to fold the SGPRs into the V_CNDMASK instructions.
-; SI: V_CNDMASK_B32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: V_CNDMASK_B32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
-define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
-  %cmp = icmp eq i32 %in, 0
-  %fp = sitofp i1 %cmp to double
-  store double %fp, double addrspace(1)* %out, align 4
-  ret void
-}
-
-; SI-LABEL: @sint_to_fp_i1_f64_load:
-; SI: V_CNDMASK_B32_e64 [[IRESULT:v[0-9]]], 0, -1
-; SI-NEXT: V_CVT_F64_I32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
-; SI: BUFFER_STORE_DWORDX2 [[RESULT]]
-; SI: S_ENDPGM
-define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) {
-  %fp = sitofp i1 %in to double
-  store double %fp, double addrspace(1)* %out, align 8
-  ret void
-}
diff --git a/test/CodeGen/R600/smrd.ll b/test/CodeGen/R600/smrd.ll
index dec6185..1c7df16 100644
--- a/test/CodeGen/R600/smrd.ll
+++ b/test/CodeGen/R600/smrd.ll
@@ -1,8 +1,8 @@
 ; RUN: llc < %s -march=r600 -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck %s
 
 ; SMRD load with an immediate offset.
-; CHECK-LABEL: @smrd0
-; CHECK: S_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
+; CHECK-LABEL: {{^}}smrd0:
+; CHECK: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
 define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32 addrspace(2)* %ptr, i64 1
@@ -12,8 +12,8 @@ entry:
 }
 
 ; SMRD load with the largest possible immediate offset.
-; CHECK-LABEL: @smrd1
-; CHECK: S_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; CHECK-LABEL: {{^}}smrd1:
+; CHECK: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
 define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32 addrspace(2)* %ptr, i64 255
@@ -23,9 +23,10 @@ entry:
 }
 
 ; SMRD load with an offset greater than the largest possible immediate.
-; CHECK-LABEL: @smrd2
-; CHECK: S_MOV_B32 s[[OFFSET:[0-9]]], 0x400
-; CHECK: S_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; CHECK-LABEL: {{^}}smrd2:
+; CHECK: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
+; CHECK: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; CHECK: s_endpgm
 define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32 addrspace(2)* %ptr, i64 256
@@ -35,15 +36,15 @@ entry:
 }
 
 ; SMRD load with a 64-bit offset
-; CHECK-LABEL: @smrd3
-; CHECK-DAG: S_MOV_B32 s[[SHI:[0-9]+]], 4
-; CHECK-DAG: S_MOV_B32 s[[SLO:[0-9]+]], 0
+; CHECK-LABEL: {{^}}smrd3:
+; CHECK-DAG: s_mov_b32 s[[SHI:[0-9]+]], 4
+; CHECK-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0 ;
 ; FIXME: We don't need to copy these values to VGPRs
-; CHECK-DAG: V_MOV_B32_e32 v[[VHI:[0-9]+]], s[[SHI]]
-; CHECK-DAG: V_MOV_B32_e32 v[[VLO:[0-9]+]], s[[SLO]]
-; FIXME: We should be able to use S_LOAD_DWORD here
-; BUFFER_LOAD_DWORD v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] + v[[[VLO]]:[[VHI]]] + 0x0
-
+; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; FIXME: We should be able to use s_load_dword here
+; CHECK: buffer_load_dword v{{[0-9]+}}, v{{\[}}[[VLO]]:[[VHI]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64
+; CHECK: s_endpgm
 define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
@@ -53,8 +54,8 @@ entry:
 }
 
 ; SMRD load using the load.const intrinsic with an immediate offset
-; CHECK-LABEL: @smrd_load_const0
-; CHECK: S_BUFFER_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
+; CHECK-LABEL: {{^}}smrd_load_const0:
+; CHECK: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
 define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
@@ -64,10 +65,10 @@ main_body:
   ret void
 }
 
-; SMRD load using the load.const intrinsic with an offset greater largest possible
-; immediate offset.
-; CHECK-LABEL: @smrd_load_const1
-; CHECK: S_BUFFER_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; SMRD load using the load.const intrinsic with the largest possible immediate
+; offset.
+; CHECK-LABEL: {{^}}smrd_load_const1:
+; CHECK: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
 define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
@@ -76,10 +77,12 @@ main_body:
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
   ret void
 }
-; SMRD load using the load.const intrinsic with the largetst possible
+; SMRD load using the load.const intrinsic with an offset greater than the
+; largets possible immediate.
 ; immediate offset.
-; CHECK-LABEL: @smrd_load_const2
-; CHECK: S_BUFFER_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; CHECK-LABEL: {{^}}smrd_load_const2:
+; CHECK: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
+; CHECK: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
 define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
diff --git a/test/CodeGen/R600/split-scalar-i64-add.ll b/test/CodeGen/R600/split-scalar-i64-add.ll
new file mode 100644
index 0000000..e3448dc
--- /dev/null
+++ b/test/CodeGen/R600/split-scalar-i64-add.ll
@@ -0,0 +1,48 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() readnone
+
+; This is broken because the low half of the 64-bit add remains on the
+; SALU, but the upper half does not. The addc expects the carry bit
+; set in vcc, which is undefined since the low scalar half add sets
+; scc instead.
+
+; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0:
+; SI: v_add_i32
+; SI: v_addc_u32
+define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) {
+  %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0
+  %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1
+  %bc = bitcast <2 x i32> %vec.1 to i64
+  %add = add i64 %bc, 399
+  store i64 %add, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1:
+; SI: v_add_i32
+; SI: v_addc_u32
+define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) {
+  %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
+  %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1
+  %bc = bitcast <2 x i32> %vec.1 to i64
+  %add = add i64 %bc, %val1
+  store i64 %add, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; Doesn't use constants
+; FUNC-LABEL @imp_def_vcc_split_i64_add_2
+; SI: v_add_i32
+; SI: v_addc_u32
+define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
+  %tid = call i32 @llvm.r600.read.tidig.x() readnone
+  %gep = getelementptr i32 addrspace(1)* %in, i32 %tid
+  %load = load i32 addrspace(1)* %gep
+  %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
+  %vec.1 = insertelement <2 x i32> %vec.0, i32 %load, i32 1
+  %bc = bitcast <2 x i32> %vec.1 to i64
+  %add = add i64 %bc, %val1
+  store i64 %add, i64 addrspace(1)* %out, align 8
+  ret void
+}
diff --git a/test/CodeGen/R600/sra.ll b/test/CodeGen/R600/sra.ll
index 9eb3dc5..8ba9daa 100644
--- a/test/CodeGen/R600/sra.ll
+++ b/test/CodeGen/R600/sra.ll
@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-;EG-CHECK-LABEL: @ashr_v2i32
+;EG-CHECK-LABEL: {{^}}ashr_v2i32:
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK-LABEL: @ashr_v2i32
-;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK-LABEL: {{^}}ashr_v2i32:
+;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -18,17 +18,17 @@ define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
   ret void
 }
 
-;EG-CHECK-LABEL: @ashr_v4i32
+;EG-CHECK-LABEL: {{^}}ashr_v4i32:
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK-LABEL: @ashr_v4i32
-;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK-LABEL: {{^}}ashr_v4i32:
+;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -39,11 +39,11 @@ define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
   ret void
 }
 
-;EG-CHECK-LABEL: @ashr_i64
+;EG-CHECK-LABEL: {{^}}ashr_i64:
 ;EG-CHECK: ASHR
 
-;SI-CHECK-LABEL: @ashr_i64
-;SI-CHECK: S_ASHR_I64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
+;SI-CHECK-LABEL: {{^}}ashr_i64:
+;SI-CHECK: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
 define void @ashr_i64(i64 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = sext i32 %in to i64
@@ -52,7 +52,7 @@ entry:
   ret void
 }
 
-;EG-CHECK-LABEL: @ashr_i64_2
+;EG-CHECK-LABEL: {{^}}ashr_i64_2:
 ;EG-CHECK: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
 ;EG-CHECK: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
 ;EG-CHECK: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
@@ -66,8 +66,8 @@ entry:
 ;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
 ;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
 
-;SI-CHECK-LABEL: @ashr_i64_2
-;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK-LABEL: {{^}}ashr_i64_2:
+;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 entry:
   %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
@@ -78,7 +78,7 @@ entry:
   ret void
 }
 
-;EG-CHECK-LABEL: @ashr_v2i64
+;EG-CHECK-LABEL: {{^}}ashr_v2i64:
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHA]]
@@ -104,9 +104,9 @@ entry:
 ;EG-CHECK-DAG: CNDE_INT
 ;EG-CHECK-DAG: CNDE_INT
 
-;SI-CHECK-LABEL: @ashr_v2i64
-;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK-LABEL: {{^}}ashr_v2i64:
+;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
 define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
@@ -117,7 +117,7 @@ define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i
   ret void
 }
 
-;EG-CHECK-LABEL: @ashr_v4i64
+;EG-CHECK-LABEL: {{^}}ashr_v4i64:
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
@@ -167,11 +167,11 @@ define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i
 ;EG-CHECK-DAG: CNDE_INT
 ;EG-CHECK-DAG: CNDE_INT
 
-;SI-CHECK-LABEL: @ashr_v4i64
-;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK-LABEL: {{^}}ashr_v4i64:
+;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
 define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1
diff --git a/test/CodeGen/R600/srl.ll b/test/CodeGen/R600/srl.ll
index 44ad73f..8c5daf6 100644
--- a/test/CodeGen/R600/srl.ll
+++ b/test/CodeGen/R600/srl.ll
@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-;EG-CHECK: @lshr_v2i32
+;EG-CHECK: {{^}}lshr_v2i32:
 ;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @lshr_v2i32
-;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: {{^}}lshr_v2i32:
+;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -19,17 +19,17 @@ define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
 }
 
 
-;EG-CHECK: @lshr_v4i32
+;EG-CHECK: {{^}}lshr_v4i32:
 ;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @lshr_v4i32
-;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: {{^}}lshr_v4i32:
+;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -40,7 +40,7 @@ define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
   ret void
 }
 
-;EG-CHECK: @lshr_i64
+;EG-CHECK: {{^}}lshr_i64:
 ;EG-CHECK: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
 ;EG-CHECK: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
 ;EG-CHECK: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
@@ -53,8 +53,8 @@ define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
 ;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
 ;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
 
-;SI-CHECK: @lshr_i64
-;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: {{^}}lshr_i64:
+;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
 define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
@@ -65,7 +65,7 @@ define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   ret void
 }
 
-;EG-CHECK: @lshr_v2i64
+;EG-CHECK: {{^}}lshr_v2i64:
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHA]]
@@ -89,9 +89,9 @@ define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ;EG-CHECK-DAG: CNDE_INT
 ;EG-CHECK-DAG: CNDE_INT
 
-;SI-CHECK: @lshr_v2i64
-;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: {{^}}lshr_v2i64:
+;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
 define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
@@ -103,7 +103,7 @@ define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i
 }
 
 
-;EG-CHECK: @lshr_v4i64
+;EG-CHECK: {{^}}lshr_v4i64:
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
@@ -151,11 +151,11 @@ define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i
 ;EG-CHECK-DAG: CNDE_INT
 ;EG-CHECK-DAG: CNDE_INT
 
-;SI-CHECK: @lshr_v4i64
-;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: {{^}}lshr_v4i64:
+;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
 define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1
diff --git a/test/CodeGen/R600/ssubo.ll b/test/CodeGen/R600/ssubo.ll
index b330276..8031c6f 100644
--- a/test/CodeGen/R600/ssubo.ll
+++ b/test/CodeGen/R600/ssubo.ll
@@ -4,7 +4,7 @@
 declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
 
-; FUNC-LABEL: @ssubo_i64_zext
+; FUNC-LABEL: {{^}}ssubo_i64_zext:
 define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %ssub, 0
@@ -15,7 +15,7 @@ define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @s_ssubo_i32
+; FUNC-LABEL: {{^}}s_ssubo_i32:
 define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
   %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %ssub, 0
@@ -25,7 +25,7 @@ define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
   ret void
 }
 
-; FUNC-LABEL: @v_ssubo_i32
+; FUNC-LABEL: {{^}}v_ssubo_i32:
 define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32 addrspace(1)* %aptr, align 4
   %b = load i32 addrspace(1)* %bptr, align 4
@@ -37,9 +37,9 @@ define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
   ret void
 }
 
-; FUNC-LABEL: @s_ssubo_i64
-; SI: S_SUB_I32
-; SI: S_SUBB_U32
+; FUNC-LABEL: {{^}}s_ssubo_i64:
+; SI: s_sub_u32
+; SI: s_subb_u32
 define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
   %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %ssub, 0
@@ -49,9 +49,9 @@ define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
   ret void
 }
 
-; FUNC-LABEL: @v_ssubo_i64
-; SI: V_SUB_I32_e32
-; SI: V_SUBB_U32_e32
+; FUNC-LABEL: {{^}}v_ssubo_i64:
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
 define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %a = load i64 addrspace(1)* %aptr, align 4
   %b = load i64 addrspace(1)* %bptr, align 4
diff --git a/test/CodeGen/R600/store-v3i32.ll b/test/CodeGen/R600/store-v3i32.ll
index 3357803..0f28f33 100644
--- a/test/CodeGen/R600/store-v3i32.ll
+++ b/test/CodeGen/R600/store-v3i32.ll
@@ -4,8 +4,8 @@
 ; 3 vectors have the same size and alignment as 4 vectors, so this
 ; should be done in a single store.
 
-; SI-LABEL: @store_v3i32:
-; SI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}store_v3i32:
+; SI: buffer_store_dwordx4
 define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind {
   store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16
   ret void
diff --git a/test/CodeGen/R600/store-v3i64.ll b/test/CodeGen/R600/store-v3i64.ll
index 58d28b5..247a561 100644
--- a/test/CodeGen/R600/store-v3i64.ll
+++ b/test/CodeGen/R600/store-v3i64.ll
@@ -1,27 +1,27 @@
 ; XFAIL: *
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI
 
-; SI-LABEL: @global_store_v3i64:
-; SI: BUFFER_STORE_DWORDX4
-; SI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}global_store_v3i64:
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
 define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32
   ret void
 }
 
-; SI-LABEL: @global_store_v3i64_unaligned:
+; SI-LABEL: {{^}}global_store_v3i64_unaligned:
 define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1
   ret void
 }
 
-; SI-LABEL: @local_store_v3i64:
+; SI-LABEL: {{^}}local_store_v3i64:
 define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32
   ret void
 }
 
-; SI-LABEL: @local_store_v3i64_unaligned:
+; SI-LABEL: {{^}}local_store_v3i64_unaligned:
 define void @local_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1
   ret void
diff --git a/test/CodeGen/R600/store-vector-ptrs.ll b/test/CodeGen/R600/store-vector-ptrs.ll
index 41c5edc..aee639b 100644
--- a/test/CodeGen/R600/store-vector-ptrs.ll
+++ b/test/CodeGen/R600/store-vector-ptrs.ll
@@ -1,9 +1,11 @@
-; REQUIRES: asserts
-; XFAIL: *
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s
 
+; This tests for a bug that caused a crash in
+; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting
+; scratch loads and stores.
+; CHECK-LABEL: {{^}}store_vector_ptrs:
 define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind {
   %p = getelementptr <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
   store <4 x i32*> %p, <4 x i32*>* %out
   ret void
-}
-\ No newline at end of file
+}
diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll
index dd27533..713ecd6 100644
--- a/test/CodeGen/R600/store.ll
+++ b/test/CodeGen/R600/store.ll
@@ -5,9 +5,9 @@
 ;===------------------------------------------------------------------------===;
 ; Global Address Space
 ;===------------------------------------------------------------------------===;
-; FUNC-LABEL: @store_i1
+; FUNC-LABEL: {{^}}store_i1:
 ; EG-CHECK: MEM_RAT MSKOR
-; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK: buffer_store_byte
 define void @store_i1(i1 addrspace(1)* %out) {
 entry:
   store i1 true, i1 addrspace(1)* %out
@@ -15,7 +15,7 @@ entry:
 }
 
 ; i8 store
-; EG-CHECK-LABEL: @store_i8
+; EG-CHECK-LABEL: {{^}}store_i8:
 ; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
 ; EG-CHECK: VTX_READ_8 [[VAL:T[0-9]\.X]], [[VAL]]
 ; IG 0: Get the byte index and truncate the value
@@ -34,8 +34,8 @@ entry:
 ; EG-CHECK: MOV T[[RW_GPR]].Y, 0.0
 ; EG-CHECK: MOV * T[[RW_GPR]].Z, 0.0
 
-; SI-CHECK-LABEL: @store_i8
-; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK-LABEL: {{^}}store_i8:
+; SI-CHECK: buffer_store_byte
 
 define void @store_i8(i8 addrspace(1)* %out, i8 %in) {
 entry:
@@ -44,7 +44,7 @@ entry:
 }
 
 ; i16 store
-; EG-CHECK-LABEL: @store_i16
+; EG-CHECK-LABEL: {{^}}store_i16:
 ; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
 ; EG-CHECK: VTX_READ_16 [[VAL:T[0-9]\.X]], [[VAL]]
 ; IG 0: Get the byte index and truncate the value
@@ -63,20 +63,20 @@ entry:
 ; EG-CHECK: MOV T[[RW_GPR]].Y, 0.0
 ; EG-CHECK: MOV * T[[RW_GPR]].Z, 0.0
 
-; SI-CHECK-LABEL: @store_i16
-; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK-LABEL: {{^}}store_i16:
+; SI-CHECK: buffer_store_short
 define void @store_i16(i16 addrspace(1)* %out, i16 %in) {
 entry:
   store i16 %in, i16 addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @store_v2i8
+; EG-CHECK-LABEL: {{^}}store_v2i8:
 ; EG-CHECK: MEM_RAT MSKOR
 ; EG-CHECK-NOT: MEM_RAT MSKOR
-; SI-CHECK-LABEL: @store_v2i8
-; SI-CHECK: BUFFER_STORE_BYTE
-; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK-LABEL: {{^}}store_v2i8:
+; SI-CHECK: buffer_store_byte
+; SI-CHECK: buffer_store_byte
 define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i8>
@@ -85,13 +85,13 @@ entry:
 }
 
 
-; EG-CHECK-LABEL: @store_v2i16
+; EG-CHECK-LABEL: {{^}}store_v2i16:
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
-; CM-CHECK-LABEL: @store_v2i16
+; CM-CHECK-LABEL: {{^}}store_v2i16:
 ; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
-; SI-CHECK-LABEL: @store_v2i16
-; SI-CHECK: BUFFER_STORE_SHORT
-; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK-LABEL: {{^}}store_v2i16:
+; SI-CHECK: buffer_store_short
+; SI-CHECK: buffer_store_short
 define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i16>
@@ -99,15 +99,15 @@ entry:
   ret void
 }
 
-; EG-CHECK-LABEL: @store_v4i8
+; EG-CHECK-LABEL: {{^}}store_v4i8:
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
-; CM-CHECK-LABEL: @store_v4i8
+; CM-CHECK-LABEL: {{^}}store_v4i8:
 ; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
-; SI-CHECK-LABEL: @store_v4i8
-; SI-CHECK: BUFFER_STORE_BYTE
-; SI-CHECK: BUFFER_STORE_BYTE
-; SI-CHECK: BUFFER_STORE_BYTE
-; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK-LABEL: {{^}}store_v4i8:
+; SI-CHECK: buffer_store_byte
+; SI-CHECK: buffer_store_byte
+; SI-CHECK: buffer_store_byte
+; SI-CHECK: buffer_store_byte
 define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
@@ -116,30 +116,30 @@ entry:
 }
 
 ; floating-point store
-; EG-CHECK-LABEL: @store_f32
+; EG-CHECK-LABEL: {{^}}store_f32:
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1
-; CM-CHECK-LABEL: @store_f32
+; CM-CHECK-LABEL: {{^}}store_f32:
 ; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK-LABEL: @store_f32
-; SI-CHECK: BUFFER_STORE_DWORD
+; SI-CHECK-LABEL: {{^}}store_f32:
+; SI-CHECK: buffer_store_dword
 
 define void @store_f32(float addrspace(1)* %out, float %in) {
   store float %in, float addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @store_v4i16
+; EG-CHECK-LABEL: {{^}}store_v4i16:
 ; EG-CHECK: MEM_RAT MSKOR
 ; EG-CHECK: MEM_RAT MSKOR
 ; EG-CHECK: MEM_RAT MSKOR
 ; EG-CHECK: MEM_RAT MSKOR
 ; EG-CHECK-NOT: MEM_RAT MSKOR
-; SI-CHECK-LABEL: @store_v4i16
-; SI-CHECK: BUFFER_STORE_SHORT
-; SI-CHECK: BUFFER_STORE_SHORT
-; SI-CHECK: BUFFER_STORE_SHORT
-; SI-CHECK: BUFFER_STORE_SHORT
-; SI-CHECK-NOT: BUFFER_STORE_BYTE
+; SI-CHECK-LABEL: {{^}}store_v4i16:
+; SI-CHECK: buffer_store_short
+; SI-CHECK: buffer_store_short
+; SI-CHECK: buffer_store_short
+; SI-CHECK: buffer_store_short
+; SI-CHECK-NOT: buffer_store_byte
 define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i16>
@@ -148,12 +148,12 @@ entry:
 }
 
 ; vec2 floating-point stores
-; EG-CHECK-LABEL: @store_v2f32
+; EG-CHECK-LABEL: {{^}}store_v2f32:
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
-; CM-CHECK-LABEL: @store_v2f32
+; CM-CHECK-LABEL: {{^}}store_v2f32:
 ; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
-; SI-CHECK-LABEL: @store_v2f32
-; SI-CHECK: BUFFER_STORE_DWORDX2
+; SI-CHECK-LABEL: {{^}}store_v2f32:
+; SI-CHECK: buffer_store_dwordx2
 
 define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
 entry:
@@ -163,23 +163,23 @@ entry:
   ret void
 }
 
-; EG-CHECK-LABEL: @store_v4i32
+; EG-CHECK-LABEL: {{^}}store_v4i32:
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
 ; EG-CHECK-NOT: MEM_RAT_CACHELESS STORE_RAW
-; CM-CHECK-LABEL: @store_v4i32
+; CM-CHECK-LABEL: {{^}}store_v4i32:
 ; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
 ; CM-CHECK-NOT: MEM_RAT_CACHELESS STORE_DWORD
-; SI-CHECK-LABEL: @store_v4i32
-; SI-CHECK: BUFFER_STORE_DWORDX4
+; SI-CHECK-LABEL: {{^}}store_v4i32:
+; SI-CHECK: buffer_store_dwordx4
 define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @store_i64_i8
+; FUNC-LABEL: {{^}}store_i64_i8:
 ; EG-CHECK: MEM_RAT MSKOR
-; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK: buffer_store_byte
 define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i8
@@ -187,9 +187,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @store_i64_i16
+; FUNC-LABEL: {{^}}store_i64_i16:
 ; EG-CHECK: MEM_RAT MSKOR
-; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK: buffer_store_short
 define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i16
@@ -201,99 +201,99 @@ entry:
 ; Local Address Space
 ;===------------------------------------------------------------------------===;
 
-; FUNC-LABEL: @store_local_i1
+; FUNC-LABEL: {{^}}store_local_i1:
 ; EG-CHECK: LDS_BYTE_WRITE
-; SI-CHECK: DS_WRITE_B8
+; SI-CHECK: ds_write_b8
 define void @store_local_i1(i1 addrspace(3)* %out) {
 entry:
   store i1 true, i1 addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @store_local_i8
+; EG-CHECK-LABEL: {{^}}store_local_i8:
 ; EG-CHECK: LDS_BYTE_WRITE
-; SI-CHECK-LABEL: @store_local_i8
-; SI-CHECK: DS_WRITE_B8
+; SI-CHECK-LABEL: {{^}}store_local_i8:
+; SI-CHECK: ds_write_b8
 define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
   store i8 %in, i8 addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @store_local_i16
+; EG-CHECK-LABEL: {{^}}store_local_i16:
 ; EG-CHECK: LDS_SHORT_WRITE
-; SI-CHECK-LABEL: @store_local_i16
-; SI-CHECK: DS_WRITE_B16
+; SI-CHECK-LABEL: {{^}}store_local_i16:
+; SI-CHECK: ds_write_b16
 define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
   store i16 %in, i16 addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @store_local_v2i16
+; EG-CHECK-LABEL: {{^}}store_local_v2i16:
 ; EG-CHECK: LDS_WRITE
-; CM-CHECK-LABEL: @store_local_v2i16
+; CM-CHECK-LABEL: {{^}}store_local_v2i16:
 ; CM-CHECK: LDS_WRITE
-; SI-CHECK-LABEL: @store_local_v2i16
-; SI-CHECK: DS_WRITE_B16
-; SI-CHECK: DS_WRITE_B16
+; SI-CHECK-LABEL: {{^}}store_local_v2i16:
+; SI-CHECK: ds_write_b16
+; SI-CHECK: ds_write_b16
 define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
 entry:
   store <2 x i16> %in, <2 x i16> addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @store_local_v4i8
+; EG-CHECK-LABEL: {{^}}store_local_v4i8:
 ; EG-CHECK: LDS_WRITE
-; CM-CHECK-LABEL: @store_local_v4i8
+; CM-CHECK-LABEL: {{^}}store_local_v4i8:
 ; CM-CHECK: LDS_WRITE
-; SI-CHECK-LABEL: @store_local_v4i8
-; SI-CHECK: DS_WRITE_B8
-; SI-CHECK: DS_WRITE_B8
-; SI-CHECK: DS_WRITE_B8
-; SI-CHECK: DS_WRITE_B8
+; SI-CHECK-LABEL: {{^}}store_local_v4i8:
+; SI-CHECK: ds_write_b8
+; SI-CHECK: ds_write_b8
+; SI-CHECK: ds_write_b8
+; SI-CHECK: ds_write_b8
 define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @store_local_v2i32
+; EG-CHECK-LABEL: {{^}}store_local_v2i32:
 ; EG-CHECK: LDS_WRITE
 ; EG-CHECK: LDS_WRITE
-; CM-CHECK-LABEL: @store_local_v2i32
+; CM-CHECK-LABEL: {{^}}store_local_v2i32:
 ; CM-CHECK: LDS_WRITE
 ; CM-CHECK: LDS_WRITE
-; SI-CHECK-LABEL: @store_local_v2i32
-; SI-CHECK: DS_WRITE_B64
+; SI-CHECK-LABEL: {{^}}store_local_v2i32:
+; SI-CHECK: ds_write_b64
 define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
 entry:
   store <2 x i32> %in, <2 x i32> addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @store_local_v4i32
+; EG-CHECK-LABEL: {{^}}store_local_v4i32:
 ; EG-CHECK: LDS_WRITE
 ; EG-CHECK: LDS_WRITE
 ; EG-CHECK: LDS_WRITE
 ; EG-CHECK: LDS_WRITE
-; CM-CHECK-LABEL: @store_local_v4i32
+; CM-CHECK-LABEL: {{^}}store_local_v4i32:
 ; CM-CHECK: LDS_WRITE
 ; CM-CHECK: LDS_WRITE
 ; CM-CHECK: LDS_WRITE
 ; CM-CHECK: LDS_WRITE
-; SI-CHECK-LABEL: @store_local_v4i32
-; SI-CHECK: DS_WRITE_B32
-; SI-CHECK: DS_WRITE_B32
-; SI-CHECK: DS_WRITE_B32
-; SI-CHECK: DS_WRITE_B32
+; SI-CHECK-LABEL: {{^}}store_local_v4i32:
+; SI-CHECK: ds_write_b32
+; SI-CHECK: ds_write_b32
+; SI-CHECK: ds_write_b32
+; SI-CHECK: ds_write_b32
 define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(3)* %out
   ret void
 }
 
-; FUNC-LABEL: @store_local_i64_i8
+; FUNC-LABEL: {{^}}store_local_i64_i8:
 ; EG-CHECK: LDS_BYTE_WRITE
-; SI-CHECK: DS_WRITE_B8
+; SI-CHECK: ds_write_b8
 define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i8
@@ -301,9 +301,9 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @store_local_i64_i16
+; FUNC-LABEL: {{^}}store_local_i64_i16:
 ; EG-CHECK: LDS_SHORT_WRITE
-; SI-CHECK: DS_WRITE_B16
+; SI-CHECK: ds_write_b16
 define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i16
@@ -318,12 +318,12 @@ entry:
 ; Evergreen / Northern Islands don't support 64-bit stores yet, so there should
 ; be two 32-bit stores.
 
-; EG-CHECK-LABEL: @vecload2
+; EG-CHECK-LABEL: {{^}}vecload2:
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
-; CM-CHECK-LABEL: @vecload2
+; CM-CHECK-LABEL: {{^}}vecload2:
 ; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
-; SI-CHECK-LABEL: @vecload2
-; SI-CHECK: BUFFER_STORE_DWORDX2
+; SI-CHECK-LABEL: {{^}}vecload2:
+; SI-CHECK: buffer_store_dwordx2
 define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
 entry:
   %0 = load i32 addrspace(2)* %mem, align 4
@@ -339,7 +339,7 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"=
 
 ; When i128 was a legal type this program generated cannot select errors:
 
-; FUNC-LABEL: @i128-const-store
+; FUNC-LABEL: {{^}}"i128-const-store":
 ; FIXME: We should be able to to this with one store instruction
 ; EG-CHECK: STORE_RAW
 ; EG-CHECK: STORE_RAW
@@ -349,8 +349,8 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"=
 ; CM-CHECK: STORE_DWORD
 ; CM-CHECK: STORE_DWORD
 ; CM-CHECK: STORE_DWORD
-; SI: BUFFER_STORE_DWORDX2
-; SI: BUFFER_STORE_DWORDX2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
 define void @i128-const-store(i32 addrspace(1)* %out) {
 entry:
   store i32 1, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/store.r600.ll b/test/CodeGen/R600/store.r600.ll
index 00589a0..3df30d4 100644
--- a/test/CodeGen/R600/store.r600.ll
+++ b/test/CodeGen/R600/store.r600.ll
@@ -3,7 +3,7 @@
 ; XXX: Merge this test into store.ll once it is supported on SI
 
 ; v4i32 store
-; EG-CHECK: @store_v4i32
+; EG-CHECK: {{^}}store_v4i32:
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
 
 define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
@@ -13,7 +13,7 @@ define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %
 }
 
 ; v4f32 store
-; EG-CHECK: @store_v4f32
+; EG-CHECK: {{^}}store_v4f32:
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
 define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %1 = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/structurize.ll b/test/CodeGen/R600/structurize.ll
index c2acd93..02e592e 100644
--- a/test/CodeGen/R600/structurize.ll
+++ b/test/CodeGen/R600/structurize.ll
@@ -13,7 +13,7 @@
 ;
 ;
 
-; CHECK-LABEL: @branch_into_diamond
+; CHECK-LABEL: {{^}}branch_into_diamond:
 ; === entry block:
 ; CHECK: ALU_PUSH_BEFORE
 ; === Branch instruction (IF):
diff --git a/test/CodeGen/R600/structurize1.ll b/test/CodeGen/R600/structurize1.ll
index 8c10301..77432c1 100644
--- a/test/CodeGen/R600/structurize1.ll
+++ b/test/CodeGen/R600/structurize1.ll
@@ -16,7 +16,7 @@
 ;   }
 ; }
 
-; CHECK-LABEL: @if_inside_loop
+; CHECK-LABEL: {{^}}if_inside_loop:
 ; CHECK: LOOP_START_DX10
 ; CHECK: END_LOOP
 define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
diff --git a/test/CodeGen/R600/sub.ll b/test/CodeGen/R600/sub.ll
index 8e64148..2bbc0cf 100644
--- a/test/CodeGen/R600/sub.ll
+++ b/test/CodeGen/R600/sub.ll
@@ -3,12 +3,12 @@
 
 declare i32 @llvm.r600.read.tidig.x() readnone
 
-;FUNC-LABEL: @test2
+;FUNC-LABEL: {{^}}test2:
 ;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -19,16 +19,16 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   ret void
 }
 
-;FUNC-LABEL: @test4
+;FUNC-LABEL: {{^}}test4:
 ;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -39,9 +39,9 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   ret void
 }
 
-; FUNC-LABEL: @s_sub_i64:
-; SI: S_SUB_I32
-; SI: S_SUBB_U32
+; FUNC-LABEL: {{^}}s_sub_i64:
+; SI: s_sub_u32
+; SI: s_subb_u32
 
 ; EG-DAG: SETGE_UINT
 ; EG-DAG: CNDE_INT
@@ -54,9 +54,9 @@ define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind
   ret void
 }
 
-; FUNC-LABEL: @v_sub_i64:
-; SI: V_SUB_I32_e32
-; SI: V_SUBB_U32_e32
+; FUNC-LABEL: {{^}}v_sub_i64:
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
 
 ; EG-DAG: SETGE_UINT
 ; EG-DAG: CNDE_INT
diff --git a/test/CodeGen/R600/swizzle-export.ll b/test/CodeGen/R600/swizzle-export.ll
index 16c3f19..3e6f7a7 100644
--- a/test/CodeGen/R600/swizzle-export.ll
+++ b/test/CodeGen/R600/swizzle-export.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
 
-;EG-CHECK: @main
+;EG-CHECK: {{^}}main:
 ;EG-CHECK: EXPORT T{{[0-9]+}}.XYXX
 ;EG-CHECK: EXPORT T{{[0-9]+}}.ZXXX
 ;EG-CHECK: EXPORT T{{[0-9]+}}.XXWX
@@ -92,9 +92,9 @@ main_body:
   ret void
 }
 
-; EG-CHECK: @main2
+; EG-CHECK: {{^}}main2:
 ; EG-CHECK: T{{[0-9]+}}.XY__
-; EG-CHECK: T{{[0-9]+}}.YXZ0
+; EG-CHECK: T{{[0-9]+}}.ZXY0
 
 define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
diff --git a/test/CodeGen/R600/trunc-store-i1.ll b/test/CodeGen/R600/trunc-store-i1.ll
index a3975c8..3c1b19f 100644
--- a/test/CodeGen/R600/trunc-store-i1.ll
+++ b/test/CodeGen/R600/trunc-store-i1.ll
@@ -1,30 +1,30 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 
 
-; SI-LABEL: @global_truncstore_i32_to_i1
-; SI: S_LOAD_DWORD [[LOAD:s[0-9]+]],
-; SI: S_AND_B32 [[SREG:s[0-9]+]], [[LOAD]], 1
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], [[SREG]]
-; SI: BUFFER_STORE_BYTE [[VREG]],
+; SI-LABEL: {{^}}global_truncstore_i32_to_i1:
+; SI: s_load_dword [[LOAD:s[0-9]+]],
+; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
+; SI: buffer_store_byte [[VREG]],
 define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind {
   %trunc = trunc i32 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
   ret void
 }
 
-; SI-LABEL: @global_truncstore_i64_to_i1
-; SI: BUFFER_STORE_BYTE
+; SI-LABEL: {{^}}global_truncstore_i64_to_i1:
+; SI: buffer_store_byte
 define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind {
   %trunc = trunc i64 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
   ret void
 }
 
-; SI-LABEL: @global_truncstore_i16_to_i1
-; SI: S_LOAD_DWORD [[LOAD:s[0-9]+]],
-; SI: S_AND_B32 [[SREG:s[0-9]+]], [[LOAD]], 1
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], [[SREG]]
-; SI: BUFFER_STORE_BYTE [[VREG]],
+; SI-LABEL: {{^}}global_truncstore_i16_to_i1:
+; SI: s_load_dword [[LOAD:s[0-9]+]],
+; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
+; SI: buffer_store_byte [[VREG]],
 define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
   %trunc = trunc i16 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
diff --git a/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll b/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll
index ec959c2..878ea3f 100644
--- a/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll
+++ b/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll
@@ -4,7 +4,7 @@
 ; vector stores at the end of a basic block were not being added to the
 ; LegalizedNodes list, which triggered an assertion failure.
 
-; CHECK-LABEL: @test
+; CHECK-LABEL: {{^}}test:
 ; CHECK: MEM_RAT_CACHELESS STORE_RAW
 define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
 entry:
diff --git a/test/CodeGen/R600/trunc.ll b/test/CodeGen/R600/trunc.ll
index 31cdfcd..7519d10 100644
--- a/test/CodeGen/R600/trunc.ll
+++ b/test/CodeGen/R600/trunc.ll
@@ -2,12 +2,12 @@
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
 
 define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
-; SI-LABEL: @trunc_i64_to_i32_store
-; SI: S_LOAD_DWORD s0, s[0:1], 0xb
-; SI: V_MOV_B32_e32 v0, s0
-; SI: BUFFER_STORE_DWORD v0
+; SI-LABEL: {{^}}trunc_i64_to_i32_store:
+; SI: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], 0xb
+; SI: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]]
+; SI: buffer_store_dword [[VLOAD]]
 
-; EG-LABEL: @trunc_i64_to_i32_store
+; EG-LABEL: {{^}}trunc_i64_to_i32_store:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
 ; EG: LSHR
 ; EG-NEXT: 2(
@@ -16,12 +16,12 @@ define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
   ret void
 }
 
-; SI-LABEL: @trunc_load_shl_i64:
-; SI-DAG: S_LOAD_DWORDX2
-; SI-DAG: S_LOAD_DWORD [[SREG:s[0-9]+]],
-; SI: S_LSHL_B32 [[SHL:s[0-9]+]], [[SREG]], 2
-; SI: V_MOV_B32_e32 [[VSHL:v[0-9]+]], [[SHL]]
-; SI: BUFFER_STORE_DWORD [[VSHL]],
+; SI-LABEL: {{^}}trunc_load_shl_i64:
+; SI-DAG: s_load_dwordx2
+; SI-DAG: s_load_dword [[SREG:s[0-9]+]],
+; SI: s_lshl_b32 [[SHL:s[0-9]+]], [[SREG]], 2
+; SI: v_mov_b32_e32 [[VSHL:v[0-9]+]], [[SHL]]
+; SI: buffer_store_dword [[VSHL]],
 define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
   %b = shl i64 %a, 2
   %result = trunc i64 %b to i32
@@ -29,12 +29,13 @@ define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
   ret void
 }
 
-; SI-LABEL: @trunc_shl_i64:
-; SI: S_LOAD_DWORDX2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}},
-; SI: S_ADD_I32 s[[LO_ADD:[0-9]+]], s[[LO_SREG]],
-; SI: S_LSHL_B64 s{{\[}}[[LO_SREG2:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_ADD]]:{{[0-9]+\]}}, 2
-; SI: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
-; SI: BUFFER_STORE_DWORD v[[LO_VREG]],
+; SI-LABEL: {{^}}trunc_shl_i64:
+; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2
+; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
+; SI: s_addc_u32
+; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
+; SI: buffer_store_dword v[[LO_VREG]],
 define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
   %aa = add i64 %a, 234 ; Prevent shrinking store.
   %b = shl i64 %aa, 2
@@ -44,10 +45,21 @@ define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64
   ret void
 }
 
-; SI-LABEL: @trunc_i32_to_i1:
-; SI: V_AND_B32
-; SI: V_CMP_EQ_I32
-define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
+; SI-LABEL: {{^}}trunc_i32_to_i1:
+; SI: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI: v_cmp_eq_i32
+define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
+  %a = load i32 addrspace(1)* %ptr, align 4
+  %trunc = trunc i32 %a to i1
+  %result = select i1 %trunc, i32 1, i32 0
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}sgpr_trunc_i32_to_i1:
+; SI: v_and_b32_e64 v{{[0-9]+}}, 1, s{{[0-9]+}}
+; SI: v_cmp_eq_i32
+define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
   %trunc = trunc i32 %a to i1
   %result = select i1 %trunc, i32 1, i32 0
   store i32 %result, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/uaddo.ll b/test/CodeGen/R600/uaddo.ll
index a80e502..eb242c1 100644
--- a/test/CodeGen/R600/uaddo.ll
+++ b/test/CodeGen/R600/uaddo.ll
@@ -4,10 +4,10 @@
 declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
 
-; FUNC-LABEL: @uaddo_i64_zext
-; SI: ADD
-; SI: ADDC
-; SI: ADDC
+; FUNC-LABEL: {{^}}uaddo_i64_zext:
+; SI: add
+; SI: addc
+; SI: addc
 define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %uadd, 0
@@ -18,8 +18,8 @@ define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @s_uaddo_i32
-; SI: S_ADD_I32
+; FUNC-LABEL: {{^}}s_uaddo_i32:
+; SI: s_add_i32
 define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %uadd, 0
@@ -29,8 +29,8 @@ define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
   ret void
 }
 
-; FUNC-LABEL: @v_uaddo_i32
-; SI: V_ADD_I32
+; FUNC-LABEL: {{^}}v_uaddo_i32:
+; SI: v_add_i32
 define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32 addrspace(1)* %aptr, align 4
   %b = load i32 addrspace(1)* %bptr, align 4
@@ -42,9 +42,9 @@ define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
   ret void
 }
 
-; FUNC-LABEL: @s_uaddo_i64
-; SI: S_ADD_I32
-; SI: S_ADDC_U32
+; FUNC-LABEL: {{^}}s_uaddo_i64:
+; SI: s_add_u32
+; SI: s_addc_u32
 define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %uadd, 0
@@ -54,9 +54,9 @@ define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
   ret void
 }
 
-; FUNC-LABEL: @v_uaddo_i64
-; SI: V_ADD_I32
-; SI: V_ADDC_U32
+; FUNC-LABEL: {{^}}v_uaddo_i64:
+; SI: v_add_i32
+; SI: v_addc_u32
 define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %a = load i64 addrspace(1)* %aptr, align 4
   %b = load i64 addrspace(1)* %bptr, align 4
diff --git a/test/CodeGen/R600/udiv.ll b/test/CodeGen/R600/udiv.ll
index 5371321..59e91f8 100644
--- a/test/CodeGen/R600/udiv.ll
+++ b/test/CodeGen/R600/udiv.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-;EG-CHECK-LABEL: @test
+;EG-CHECK-LABEL: {{^}}test:
 ;EG-CHECK-NOT: SETGE_INT
 ;EG-CHECK: CF_END
 
@@ -18,10 +18,10 @@ define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ;The goal of this test is to make sure the ISel doesn't fail when it gets
 ;a v4i32 udiv
 
-;EG-CHECK-LABEL: @test2
+;EG-CHECK-LABEL: {{^}}test2:
 ;EG-CHECK: CF_END
-;SI-CHECK-LABEL: @test2
-;SI-CHECK: S_ENDPGM
+;SI-CHECK-LABEL: {{^}}test2:
+;SI-CHECK: s_endpgm
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -32,10 +32,10 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   ret void
 }
 
-;EG-CHECK-LABEL: @test4
+;EG-CHECK-LABEL: {{^}}test4:
 ;EG-CHECK: CF_END
-;SI-CHECK-LABEL: @test4
-;SI-CHECK: S_ENDPGM
+;SI-CHECK-LABEL: {{^}}test4:
+;SI-CHECK: s_endpgm
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/R600/udivrem.ll b/test/CodeGen/R600/udivrem.ll
index 5f5753a..f20705b 100644
--- a/test/CodeGen/R600/udivrem.ll
+++ b/test/CodeGen/R600/udivrem.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 
-; FUNC-LABEL: @test_udivrem
+; FUNC-LABEL: {{^}}test_udivrem:
 ; EG: RECIP_UINT
 ; EG-DAG: MULHI
 ; EG-DAG: MULLO_INT
@@ -26,30 +26,30 @@
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
 
-; SI: V_RCP_IFLAG_F32_e32 [[RCP:v[0-9]+]]
-; SI-DAG: V_MUL_HI_U32 [[RCP_HI:v[0-9]+]], [[RCP]]
-; SI-DAG: V_MUL_LO_I32 [[RCP_LO:v[0-9]+]], [[RCP]]
-; SI-DAG: V_SUB_I32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]]
-; SI: V_CNDMASK_B32_e64
-; SI: V_MUL_HI_U32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]]
-; SI-DAG: V_ADD_I32_e32 [[RCP_A_E:v[0-9]+]], [[E]], [[RCP]]
-; SI-DAG: V_SUBREV_I32_e32 [[RCP_S_E:v[0-9]+]], [[E]], [[RCP]]
-; SI: V_CNDMASK_B32_e64
-; SI: V_MUL_HI_U32 [[Quotient:v[0-9]+]]
-; SI: V_MUL_LO_I32 [[Num_S_Remainder:v[0-9]+]]
-; SI-DAG: V_SUB_I32_e32 [[Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[Num_S_Remainder]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI: V_AND_B32_e32 [[Tmp1:v[0-9]+]]
-; SI-DAG: V_ADD_I32_e32 [[Quotient_A_One:v[0-9]+]], 1, [[Quotient]]
-; SI-DAG: V_SUBREV_I32_e32 [[Quotient_S_One:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_ADD_I32_e32 [[Remainder_A_Den:v[0-9]+]],
-; SI-DAG: V_SUBREV_I32_e32 [[Remainder_S_Den:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI: S_ENDPGM
+; SI: v_rcp_iflag_f32_e32 [[RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[RCP_HI:v[0-9]+]], [[RCP]]
+; SI-DAG: v_mul_lo_i32 [[RCP_LO:v[0-9]+]], [[RCP]]
+; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]]
+; SI: v_cndmask_b32_e64
+; SI: v_mul_hi_u32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]]
+; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], [[RCP]], [[E]]
+; SI-DAG: v_sub_i32_e32 [[RCP_S_E:v[0-9]+]], [[RCP]], [[E]]
+; SI: v_cndmask_b32_e64
+; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]]
+; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[Num_S_Remainder]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], 1, [[Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI: s_endpgm
 define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
   %result0 = udiv i32 %x, %y
   store i32 %result0, i32 addrspace(1)* %out
@@ -58,7 +58,7 @@ define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
   ret void
 }
 
-; FUNC-LABEL: @test_udivrem_v2
+; FUNC-LABEL: {{^}}test_udivrem_v2:
 ; EG-DAG: RECIP_UINT
 ; EG-DAG: MULHI
 ; EG-DAG: MULLO_INT
@@ -106,53 +106,53 @@ define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
 
-; SI-DAG: V_RCP_IFLAG_F32_e32 [[FIRST_RCP:v[0-9]+]]
-; SI-DAG: V_MUL_HI_U32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]]
-; SI-DAG: V_MUL_LO_I32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]]
-; SI-DAG: V_SUB_I32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
-; SI-DAG: V_ADD_I32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
-; SI-DAG: V_SUBREV_I32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[FIRST_Quotient:v[0-9]+]]
-; SI-DAG: V_MUL_LO_I32 [[FIRST_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: V_SUB_I32_e32 [[FIRST_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FIRST_Num_S_Remainder]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_AND_B32_e32 [[FIRST_Tmp1:v[0-9]+]]
-; SI-DAG: V_ADD_I32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]]
-; SI-DAG: V_SUBREV_I32_e32 [[FIRST_Quotient_S_One:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_ADD_I32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: V_SUBREV_I32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_RCP_IFLAG_F32_e32 [[SECOND_RCP:v[0-9]+]]
-; SI-DAG: V_MUL_HI_U32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]]
-; SI-DAG: V_MUL_LO_I32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]]
-; SI-DAG: V_SUB_I32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
-; SI-DAG: V_ADD_I32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
-; SI-DAG: V_SUBREV_I32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[SECOND_Quotient:v[0-9]+]]
-; SI-DAG: V_MUL_LO_I32 [[SECOND_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: V_SUB_I32_e32 [[SECOND_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[SECOND_Num_S_Remainder]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_AND_B32_e32 [[SECOND_Tmp1:v[0-9]+]]
-; SI-DAG: V_ADD_I32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]]
-; SI-DAG: V_SUBREV_I32_e32 [[SECOND_Quotient_S_One:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_ADD_I32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: V_SUBREV_I32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI: S_ENDPGM
+; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]]
+; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]]
+; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
+; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_RCP]], [[FIRST_E]]
+; SI-DAG: v_sub_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_RCP]], [[FIRST_E]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_sub_i32_e32 [[FIRST_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FIRST_Num_S_Remainder]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]]
+; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]]
+; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
+; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_RCP]], [[SECOND_E]]
+; SI-DAG: v_sub_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_RCP]], [[SECOND_E]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_sub_i32_e32 [[SECOND_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[SECOND_Num_S_Remainder]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI: s_endpgm
 define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
   %result0 = udiv <2 x i32> %x, %y
   store <2 x i32> %result0, <2 x i32> addrspace(1)* %out
@@ -162,7 +162,7 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3
 }
 
 
-; FUNC-LABEL: @test_udivrem_v4
+; FUNC-LABEL: {{^}}test_udivrem_v4:
 ; EG-DAG: RECIP_UINT
 ; EG-DAG: MULHI
 ; EG-DAG: MULLO_INT
@@ -256,99 +256,99 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
 
-; SI-DAG: V_RCP_IFLAG_F32_e32 [[FIRST_RCP:v[0-9]+]]
-; SI-DAG: V_MUL_HI_U32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]]
-; SI-DAG: V_MUL_LO_I32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]]
-; SI-DAG: V_SUB_I32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
-; SI-DAG: V_ADD_I32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
-; SI-DAG: V_SUBREV_I32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[FIRST_Quotient:v[0-9]+]]
-; SI-DAG: V_MUL_LO_I32 [[FIRST_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: V_SUB_I32_e32 [[FIRST_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FIRST_Num_S_Remainder]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_AND_B32_e32 [[FIRST_Tmp1:v[0-9]+]]
-; SI-DAG: V_ADD_I32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]]
-; SI-DAG: V_SUBREV_I32_e32 [[FIRST_Quotient_S_One:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_ADD_I32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: V_SUBREV_I32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_RCP_IFLAG_F32_e32 [[SECOND_RCP:v[0-9]+]]
-; SI-DAG: V_MUL_HI_U32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]]
-; SI-DAG: V_MUL_LO_I32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]]
-; SI-DAG: V_SUB_I32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
-; SI-DAG: V_ADD_I32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
-; SI-DAG: V_SUBREV_I32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[SECOND_Quotient:v[0-9]+]]
-; SI-DAG: V_MUL_LO_I32 [[SECOND_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: V_SUB_I32_e32 [[SECOND_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[SECOND_Num_S_Remainder]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_AND_B32_e32 [[SECOND_Tmp1:v[0-9]+]]
-; SI-DAG: V_ADD_I32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]]
-; SI-DAG: V_SUBREV_I32_e32 [[SECOND_Quotient_S_One:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_ADD_I32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: V_SUBREV_I32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_RCP_IFLAG_F32_e32 [[THIRD_RCP:v[0-9]+]]
-; SI-DAG: V_MUL_HI_U32 [[THIRD_RCP_HI:v[0-9]+]], [[THIRD_RCP]]
-; SI-DAG: V_MUL_LO_I32 [[THIRD_RCP_LO:v[0-9]+]], [[THIRD_RCP]]
-; SI-DAG: V_SUB_I32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]]
-; SI-DAG: V_ADD_I32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]]
-; SI-DAG: V_SUBREV_I32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[THIRD_Quotient:v[0-9]+]]
-; SI-DAG: V_MUL_LO_I32 [[THIRD_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: V_SUB_I32_e32 [[THIRD_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[THIRD_Num_S_Remainder]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_AND_B32_e32 [[THIRD_Tmp1:v[0-9]+]]
-; SI-DAG: V_ADD_I32_e32 [[THIRD_Quotient_A_One:v[0-9]+]], {{.*}}, [[THIRD_Quotient]]
-; SI-DAG: V_SUBREV_I32_e32 [[THIRD_Quotient_S_One:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_ADD_I32_e32 [[THIRD_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: V_SUBREV_I32_e32 [[THIRD_Remainder_S_Den:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_RCP_IFLAG_F32_e32 [[FOURTH_RCP:v[0-9]+]]
-; SI-DAG: V_MUL_HI_U32 [[FOURTH_RCP_HI:v[0-9]+]], [[FOURTH_RCP]]
-; SI-DAG: V_MUL_LO_I32 [[FOURTH_RCP_LO:v[0-9]+]], [[FOURTH_RCP]]
-; SI-DAG: V_SUB_I32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]]
-; SI-DAG: V_ADD_I32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]]
-; SI-DAG: V_SUBREV_I32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[FOURTH_Quotient:v[0-9]+]]
-; SI-DAG: V_MUL_LO_I32 [[FOURTH_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: V_SUB_I32_e32 [[FOURTH_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FOURTH_Num_S_Remainder]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_AND_B32_e32 [[FOURTH_Tmp1:v[0-9]+]]
-; SI-DAG: V_ADD_I32_e32 [[FOURTH_Quotient_A_One:v[0-9]+]], {{.*}}, [[FOURTH_Quotient]]
-; SI-DAG: V_SUBREV_I32_e32 [[FOURTH_Quotient_S_One:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_ADD_I32_e32 [[FOURTH_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: V_SUBREV_I32_e32 [[FOURTH_Remainder_S_Den:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI: S_ENDPGM
+; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]]
+; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]]
+; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
+; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_RCP]], [[FIRST_E]]
+; SI-DAG: v_sub_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_RCP]], [[FIRST_E]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_sub_i32_e32 [[FIRST_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FIRST_Num_S_Remainder]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]]
+; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]]
+; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
+; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_RCP]], [[SECOND_E]]
+; SI-DAG: v_sub_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_RCP]], [[SECOND_E]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_sub_i32_e32 [[SECOND_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[SECOND_Num_S_Remainder]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_rcp_iflag_f32_e32 [[THIRD_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[THIRD_RCP_HI:v[0-9]+]], [[THIRD_RCP]]
+; SI-DAG: v_mul_lo_i32 [[THIRD_RCP_LO:v[0-9]+]], [[THIRD_RCP]]
+; SI-DAG: v_sub_i32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]]
+; SI-DAG: v_add_i32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_RCP]], [[THIRD_E]]
+; SI-DAG: v_sub_i32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_RCP]], [[THIRD_E]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[THIRD_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[THIRD_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_sub_i32_e32 [[THIRD_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[THIRD_Num_S_Remainder]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[THIRD_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[THIRD_Quotient_A_One:v[0-9]+]], {{.*}}, [[THIRD_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[THIRD_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[THIRD_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_rcp_iflag_f32_e32 [[FOURTH_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[FOURTH_RCP_HI:v[0-9]+]], [[FOURTH_RCP]]
+; SI-DAG: v_mul_lo_i32 [[FOURTH_RCP_LO:v[0-9]+]], [[FOURTH_RCP]]
+; SI-DAG: v_sub_i32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]]
+; SI-DAG: v_add_i32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_RCP]], [[FOURTH_E]]
+; SI-DAG: v_sub_i32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_RCP]], [[FOURTH_E]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FOURTH_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[FOURTH_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_sub_i32_e32 [[FOURTH_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FOURTH_Num_S_Remainder]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[FOURTH_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[FOURTH_Quotient_A_One:v[0-9]+]], {{.*}}, [[FOURTH_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[FOURTH_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[FOURTH_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[FOURTH_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI: s_endpgm
 define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
   %result0 = udiv <4 x i32> %x, %y
   store <4 x i32> %result0, <4 x i32> addrspace(1)* %out
diff --git a/test/CodeGen/R600/udivrem24.ll b/test/CodeGen/R600/udivrem24.ll
new file mode 100644
index 0000000..defb3c0
--- /dev/null
+++ b/test/CodeGen/R600/udivrem24.ll
@@ -0,0 +1,244 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}udiv24_i8:
+; SI: v_cvt_f32_ubyte
+; SI: v_cvt_f32_ubyte
+; SI: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+  %den_ptr = getelementptr i8 addrspace(1)* %in, i8 1
+  %num = load i8 addrspace(1) * %in
+  %den = load i8 addrspace(1) * %den_ptr
+  %result = udiv i8 %num, %den
+  store i8 %result, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}udiv24_i16:
+; SI: v_cvt_f32_u32
+; SI: v_cvt_f32_u32
+; SI: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+  %den_ptr = getelementptr i16 addrspace(1)* %in, i16 1
+  %num = load i16 addrspace(1) * %in, align 2
+  %den = load i16 addrspace(1) * %den_ptr, align 2
+  %result = udiv i16 %num, %den
+  store i16 %result, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}udiv24_i32:
+; SI: v_cvt_f32_u32
+; SI-DAG: v_cvt_f32_u32
+; SI-DAG: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = lshr i32 %num.i24.0, 8
+  %den.i24 = lshr i32 %den.i24.0, 8
+  %result = udiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}udiv25_i32:
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = lshr i32 %num.i24.0, 7
+  %den.i24 = lshr i32 %den.i24.0, 7
+  %result = udiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_udiv24_i32_1:
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = lshr i32 %num.i24.0, 8
+  %den.i24 = lshr i32 %den.i24.0, 7
+  %result = udiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_udiv24_i32_2:
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = lshr i32 %num.i24.0, 7
+  %den.i24 = lshr i32 %den.i24.0, 8
+  %result = udiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}urem24_i8:
+; SI: v_cvt_f32_ubyte
+; SI: v_cvt_f32_ubyte
+; SI: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+  %den_ptr = getelementptr i8 addrspace(1)* %in, i8 1
+  %num = load i8 addrspace(1) * %in
+  %den = load i8 addrspace(1) * %den_ptr
+  %result = urem i8 %num, %den
+  store i8 %result, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}urem24_i16:
+; SI: v_cvt_f32_u32
+; SI: v_cvt_f32_u32
+; SI: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+  %den_ptr = getelementptr i16 addrspace(1)* %in, i16 1
+  %num = load i16 addrspace(1) * %in, align 2
+  %den = load i16 addrspace(1) * %den_ptr, align 2
+  %result = urem i16 %num, %den
+  store i16 %result, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}urem24_i32:
+; SI: v_cvt_f32_u32
+; SI: v_cvt_f32_u32
+; SI: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = lshr i32 %num.i24.0, 8
+  %den.i24 = lshr i32 %den.i24.0, 8
+  %result = urem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}urem25_i32:
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = lshr i32 %num.i24.0, 7
+  %den.i24 = lshr i32 %den.i24.0, 7
+  %result = urem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_urem24_i32_1:
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = lshr i32 %num.i24.0, 8
+  %den.i24 = lshr i32 %den.i24.0, 7
+  %result = urem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_urem24_i32_2:
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = lshr i32 %num.i24.0, 7
+  %den.i24 = lshr i32 %den.i24.0, 8
+  %result = urem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/udivrem64.ll b/test/CodeGen/R600/udivrem64.ll
index a71315a..8864c83 100644
--- a/test/CodeGen/R600/udivrem64.ll
+++ b/test/CodeGen/R600/udivrem64.ll
@@ -1,7 +1,7 @@
 ;XUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 
-;FUNC-LABEL: @test_udiv
+;FUNC-LABEL: {{^}}test_udiv:
 ;EG: RECIP_UINT
 ;EG: LSHL {{.*}}, 1,
 ;EG: BFE_UINT
@@ -34,14 +34,14 @@
 ;EG: BFE_UINT
 ;EG: BFE_UINT
 ;EG: BFE_UINT
-;SI: S_ENDPGM
+;SI: s_endpgm
 define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = udiv i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 
-;FUNC-LABEL: @test_urem
+;FUNC-LABEL: {{^}}test_urem:
 ;EG: RECIP_UINT
 ;EG: BFE_UINT
 ;EG: BFE_UINT
@@ -74,7 +74,7 @@ define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ;EG: BFE_UINT
 ;EG: BFE_UINT
 ;EG: AND_INT {{.*}}, 1,
-;SI: S_ENDPGM
+;SI: s_endpgm
 define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = urem i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
diff --git a/test/CodeGen/R600/uint_to_fp.f64.ll b/test/CodeGen/R600/uint_to_fp.f64.ll
index 9a41796..bddf700 100644
--- a/test/CodeGen/R600/uint_to_fp.f64.ll
+++ b/test/CodeGen/R600/uint_to_fp.f64.ll
@@ -1,22 +1,24 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @uint_to_fp_f64_i32
-; SI: V_CVT_F64_U32_e32
-; SI: S_ENDPGM
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; SI-LABEL: {{^}}uint_to_fp_f64_i32
+; SI: v_cvt_f64_u32_e32
+; SI: s_endpgm
 define void @uint_to_fp_f64_i32(double addrspace(1)* %out, i32 %in) {
   %cast = uitofp i32 %in to double
   store double %cast, double addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @uint_to_fp_i1_f64:
-; SI: V_CMP_EQ_I32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI-LABEL: {{^}}uint_to_fp_i1_f64:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
 ; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
 ; we should be able to fold the SGPRs into the V_CNDMASK instructions.
-; SI: V_CNDMASK_B32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: V_CNDMASK_B32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
 define void @uint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
   %fp = uitofp i1 %cmp to double
@@ -24,13 +26,50 @@ define void @uint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
   ret void
 }
 
-; SI-LABEL: @uint_to_fp_i1_f64_load:
-; SI: V_CNDMASK_B32_e64 [[IRESULT:v[0-9]]], 0, 1
-; SI-NEXT: V_CVT_F64_U32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
-; SI: BUFFER_STORE_DWORDX2 [[RESULT]]
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}uint_to_fp_i1_f64_load:
+; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, 1
+; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
 define void @uint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) {
   %fp = uitofp i1 %in to double
   store double %fp, double addrspace(1)* %out, align 8
   ret void
 }
+
+; SI-LABEL: {{^}}v_uint_to_fp_i64_to_f64
+; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; SI-DAG: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]]
+; SI-DAG: v_cvt_f64_u32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]]
+; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr i64 addrspace(1)* %in, i32 %tid
+  %val = load i64 addrspace(1)* %gep, align 8
+  %result = uitofp i64 %val to double
+  store double %result, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_uint_to_fp_f64_i64
+define void @s_uint_to_fp_f64_i64(double addrspace(1)* %out, i64 %in) {
+  %cast = uitofp i64 %in to double
+  store double %cast, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}s_uint_to_fp_v2f64_v2i64
+define void @s_uint_to_fp_v2f64_v2i64(<2 x double> addrspace(1)* %out, <2 x i64> %in) {
+  %cast = uitofp <2 x i64> %in to <2 x double>
+  store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; SI-LABEL: {{^}}s_uint_to_fp_v4f64_v4i64
+define void @s_uint_to_fp_v4f64_v4i64(<4 x double> addrspace(1)* %out, <4 x i64> %in) {
+  %cast = uitofp <4 x i64> %in to <4 x double>
+  store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16
+  ret void
+}
diff --git a/test/CodeGen/R600/uint_to_fp.ll b/test/CodeGen/R600/uint_to_fp.ll
index 8f5d42d..f58f10b 100644
--- a/test/CodeGen/R600/uint_to_fp.ll
+++ b/test/CodeGen/R600/uint_to_fp.ll
@@ -1,30 +1,30 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
-; FUNC-LABEL: @uint_to_fp_v2i32
+; FUNC-LABEL: {{^}}uint_to_fp_v2i32:
 ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
 
-; SI: V_CVT_F32_U32_e32
-; SI: V_CVT_F32_U32_e32
-; SI: S_ENDPGM
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: s_endpgm
 define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
   %result = uitofp <2 x i32> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @uint_to_fp_v4i32
+; FUNC-LABEL: {{^}}uint_to_fp_v4i32:
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI: V_CVT_F32_U32_e32
-; SI: V_CVT_F32_U32_e32
-; SI: V_CVT_F32_U32_e32
-; SI: V_CVT_F32_U32_e32
-; SI: S_ENDPGM
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: s_endpgm
 define void @uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %value = load <4 x i32> addrspace(1) * %in
   %result = uitofp <4 x i32> %value to <4 x float>
@@ -32,14 +32,14 @@ define void @uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspac
   ret void
 }
 
-; FUNC-LABEL: @uint_to_fp_i64_f32
+; FUNC-LABEL: {{^}}uint_to_fp_i64_f32:
 ; R600: UINT_TO_FLT
 ; R600: UINT_TO_FLT
 ; R600: MULADD_IEEE
-; SI: V_CVT_F32_U32_e32
-; SI: V_CVT_F32_U32_e32
-; SI: V_MAD_F32
-; SI: S_ENDPGM
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_mad_f32
+; SI: s_endpgm
 define void @uint_to_fp_i64_f32(float addrspace(1)* %out, i64 %in) {
 entry:
   %0 = uitofp i64 %in to float
@@ -47,11 +47,11 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @uint_to_fp_i1_f32:
-; SI: V_CMP_EQ_I32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; SI-NEXT: V_CNDMASK_B32_e64 [[RESULT:v[0-9]+]], 0, 1.000000e+00, [[CMP]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}uint_to_fp_i1_f32:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @uint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
   %fp = uitofp i1 %cmp to float
@@ -59,10 +59,10 @@ define void @uint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) {
   ret void
 }
 
-; FUNC-LABEL: @uint_to_fp_i1_f32_load:
-; SI: V_CNDMASK_B32_e64 [[RESULT:v[0-9]+]], 0, 1.000000e+00
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}uint_to_fp_i1_f32_load:
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) {
   %fp = uitofp i1 %in to float
   store float %fp, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/R600/unaligned-load-store.ll b/test/CodeGen/R600/unaligned-load-store.ll
index 4df69d1..f8737e6 100644
--- a/test/CodeGen/R600/unaligned-load-store.ll
+++ b/test/CodeGen/R600/unaligned-load-store.ll
@@ -1,17 +1,101 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @unaligned_load_store_i32:
-; DS_READ_U32 {{v[0-9]+}}, 0, [[REG]]
+; FIXME: This is probably wrong. This probably needs to expand to 8-bit reads and writes.
+; SI-LABEL: {{^}}unaligned_load_store_i32:
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_write_b32
+; SI: s_endpgm
 define void @unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
   %v = load i32 addrspace(3)* %p, align 1
   store i32 %v, i32 addrspace(3)* %r, align 1
   ret void
 }
 
-; SI-LABEL: @unaligned_load_store_v4i32:
-; DS_READ_U32 {{v[0-9]+}}, 0, [[REG]]
+; SI-LABEL: {{^}}unaligned_load_store_v4i32:
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_write_b32
+; SI: ds_write_b32
+; SI: ds_write_b32
+; SI: ds_write_b32
+; SI: s_endpgm
 define void @unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind {
   %v = load <4 x i32> addrspace(3)* %p, align 1
   store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
   ret void
 }
+
+; SI-LABEL: {{^}}load_lds_i64_align_4:
+; SI: ds_read2_b32
+; SI: s_endpgm
+define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+  %val = load i64 addrspace(3)* %in, align 4
+  store i64 %val, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}load_lds_i64_align_4_with_offset
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
+; SI: s_endpgm
+define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+  %ptr = getelementptr i64 addrspace(3)* %in, i32 4
+  %val = load i64 addrspace(3)* %ptr, align 4
+  store i64 %val, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset:
+; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:0 offset1:1
+; SI: s_endpgm
+define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+  %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
+  %ptr255 = getelementptr i32 addrspace(3)* %ptr, i32 255
+  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
+  %val = load i64 addrspace(3)* %ptri64, align 4
+  store i64 %val, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FIXME: Need to fix this case.
+; define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+;   %val = load i64 addrspace(3)* %in, align 1
+;   store i64 %val, i64 addrspace(1)* %out, align 8
+;   ret void
+; }
+
+; SI-LABEL: {{^}}store_lds_i64_align_4:
+; SI: ds_write2_b32
+; SI: s_endpgm
+define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
+  store i64 %val, i64 addrspace(3)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}store_lds_i64_align_4_with_offset
+; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
+; SI: s_endpgm
+define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
+  %ptr = getelementptr i64 addrspace(3)* %out, i32 4
+  store i64 0, i64 addrspace(3)* %ptr, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset:
+; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits
+; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: s_endpgm
+define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
+  %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
+  %ptr255 = getelementptr i32 addrspace(3)* %ptr, i32 255
+  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
+  store i64 0, i64 addrspace(3)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/unhandled-loop-condition-assertion.ll b/test/CodeGen/R600/unhandled-loop-condition-assertion.ll
index e4129c5..ff01a1e 100644
--- a/test/CodeGen/R600/unhandled-loop-condition-assertion.ll
+++ b/test/CodeGen/R600/unhandled-loop-condition-assertion.ll
@@ -5,7 +5,7 @@
 
 ; SI hits an assertion at -O0, evergreen hits a not implemented unreachable.
 
-; COMMON-LABEL: @branch_true:
+; COMMON-LABEL: {{^}}branch_true:
 define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 true, label %for.end, label %for.body.lr.ph
@@ -39,9 +39,9 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
-; COMMON-LABEL: @branch_false:
+; COMMON-LABEL: {{^}}branch_false:
 ; SI: .text
-; SI-NEXT: S_ENDPGM
+; SI-NEXT: s_endpgm
 define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 false, label %for.end, label %for.body.lr.ph
@@ -75,9 +75,9 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
-; COMMON-LABEL: @branch_undef:
+; COMMON-LABEL: {{^}}branch_undef:
 ; SI: .text
-; SI-NEXT: S_ENDPGM
+; SI-NEXT: s_endpgm
 define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 undef, label %for.end, label %for.body.lr.ph
diff --git a/test/CodeGen/R600/unsupported-cc.ll b/test/CodeGen/R600/unsupported-cc.ll
index f986a02..8ab4faf 100644
--- a/test/CodeGen/R600/unsupported-cc.ll
+++ b/test/CodeGen/R600/unsupported-cc.ll
@@ -2,7 +2,7 @@
 
 ; These tests are for condition codes that are not supported by the hardware
 
-; CHECK-LABEL: @slt
+; CHECK-LABEL: {{^}}slt:
 ; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 5(7.006492e-45)
@@ -14,7 +14,7 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @ult_i32
+; CHECK-LABEL: {{^}}ult_i32:
 ; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 5(7.006492e-45)
@@ -26,7 +26,7 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @ult_float
+; CHECK-LABEL: {{^}}ult_float:
 ; CHECK: SETGE * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
@@ -39,7 +39,7 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @ult_float_native
+; CHECK-LABEL: {{^}}ult_float_native:
 ; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
 ; CHECK-NEXT: LSHR *
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -51,7 +51,7 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @olt
+; CHECK-LABEL: {{^}}olt:
 ; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: LSHR *
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -63,7 +63,7 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @sle
+; CHECK-LABEL: {{^}}sle:
 ; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 6(8.407791e-45)
@@ -75,7 +75,7 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @ule_i32
+; CHECK-LABEL: {{^}}ule_i32:
 ; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 6(8.407791e-45)
@@ -87,7 +87,7 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @ule_float
+; CHECK-LABEL: {{^}}ule_float:
 ; CHECK: SETGT * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
@@ -100,7 +100,7 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @ule_float_native
+; CHECK-LABEL: {{^}}ule_float_native:
 ; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
 ; CHECK-NEXT: LSHR *
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -112,7 +112,7 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @ole
+; CHECK-LABEL: {{^}}ole:
 ; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: LSHR *
 ; CHECK-NEXT:1084227584(5.000000e+00)
diff --git a/test/CodeGen/R600/urecip.ll b/test/CodeGen/R600/urecip.ll
index e808e3d..4d953b5 100644
--- a/test/CodeGen/R600/urecip.ll
+++ b/test/CodeGen/R600/urecip.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: V_RCP_IFLAG_F32_e32
+;CHECK: v_rcp_iflag_f32_e32
 
 define void @test(i32 %p, i32 %q) {
    %i = udiv i32 %p, %q
diff --git a/test/CodeGen/R600/urem.ll b/test/CodeGen/R600/urem.ll
index 8045145..914f5d0 100644
--- a/test/CodeGen/R600/urem.ll
+++ b/test/CodeGen/R600/urem.ll
@@ -5,10 +5,10 @@
 ;The goal of this test is to make sure the ISel doesn't fail when it gets
 ;a v2i32/v4i32 urem
 
-;EG-CHECK: @test2
+;EG-CHECK: {{^}}test2:
 ;EG-CHECK: CF_END
-;SI-CHECK: @test2
-;SI-CHECK: S_ENDPGM
+;SI-CHECK: {{^}}test2:
+;SI-CHECK: s_endpgm
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -19,10 +19,10 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   ret void
 }
 
-;EG-CHECK: @test4
+;EG-CHECK: {{^}}test4:
 ;EG-CHECK: CF_END
-;SI-CHECK: @test4
-;SI-CHECK: S_ENDPGM
+;SI-CHECK: {{^}}test4:
+;SI-CHECK: s_endpgm
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/R600/use-sgpr-multiple-times.ll b/test/CodeGen/R600/use-sgpr-multiple-times.ll
new file mode 100644
index 0000000..aa94a0e
--- /dev/null
+++ b/test/CodeGen/R600/use-sgpr-multiple-times.ll
@@ -0,0 +1,96 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.fma.f32(float, float, float) #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) #1
+
+
+; SI-LABEL: {{^}}test_sgpr_use_twice_binop:
+; SI: s_load_dword [[SGPR:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
+  %dbl = fadd float %a, %a
+  store float %dbl, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_sgpr_use_three_ternary_op:
+; SI: s_load_dword [[SGPR:s[0-9]+]],
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 {
+  %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1
+  store float %fma, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b:
+; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 {
+  %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1
+  store float %fma, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a:
+; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
+  %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1
+  store float %fma, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a:
+; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
+  %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1
+  store float %fma, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_imm:
+; SI: s_load_dword [[SGPR:s[0-9]+]]
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0
+; SI: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 {
+  %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1
+  store float %fma, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
+; SI: s_load_dword [[SGPR:s[0-9]+]]
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
+  %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
+  store float %fma, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; Don't use fma since fma c, x, y is canonicalized to fma x, c, y
+; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_imm_a_a:
+; SI: s_load_dword [[SGPR:s[0-9]+]]
+; SI: v_mad_i32_i24 [[RESULT:v[0-9]+]], 2, [[SGPR]], [[SGPR]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_imm_a_a(i32 addrspace(1)* %out, i32 %a) #0 {
+  %fma = call i32 @llvm.AMDGPU.imad24(i32 2, i32 %a, i32 %a) #1
+  store i32 %fma, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/usubo.ll b/test/CodeGen/R600/usubo.ll
index d57a2c7..abc5bd2 100644
--- a/test/CodeGen/R600/usubo.ll
+++ b/test/CodeGen/R600/usubo.ll
@@ -4,7 +4,7 @@
 declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
 
-; FUNC-LABEL: @usubo_i64_zext
+; FUNC-LABEL: {{^}}usubo_i64_zext:
 define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %usub, 0
@@ -15,8 +15,8 @@ define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   ret void
 }
 
-; FUNC-LABEL: @s_usubo_i32
-; SI: S_SUB_I32
+; FUNC-LABEL: {{^}}s_usubo_i32:
+; SI: s_sub_i32
 define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %usub, 0
@@ -26,8 +26,8 @@ define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
   ret void
 }
 
-; FUNC-LABEL: @v_usubo_i32
-; SI: V_SUBREV_I32_e32
+; FUNC-LABEL: {{^}}v_usubo_i32:
+; SI: v_sub_i32_e32
 define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32 addrspace(1)* %aptr, align 4
   %b = load i32 addrspace(1)* %bptr, align 4
@@ -39,9 +39,9 @@ define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
   ret void
 }
 
-; FUNC-LABEL: @s_usubo_i64
-; SI: S_SUB_I32
-; SI: S_SUBB_U32
+; FUNC-LABEL: {{^}}s_usubo_i64:
+; SI: s_sub_u32
+; SI: s_subb_u32
 define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %usub, 0
@@ -51,9 +51,9 @@ define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
   ret void
 }
 
-; FUNC-LABEL: @v_usubo_i64
-; SI: V_SUB_I32
-; SI: V_SUBB_U32
+; FUNC-LABEL: {{^}}v_usubo_i64:
+; SI: v_sub_i32
+; SI: v_subb_u32
 define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %a = load i64 addrspace(1)* %aptr, align 4
   %b = load i64 addrspace(1)* %bptr, align 4
diff --git a/test/CodeGen/R600/v1i64-kernel-arg.ll b/test/CodeGen/R600/v1i64-kernel-arg.ll
index 2aa1221..3175512 100644
--- a/test/CodeGen/R600/v1i64-kernel-arg.ll
+++ b/test/CodeGen/R600/v1i64-kernel-arg.ll
@@ -2,14 +2,14 @@
 ; XFAIL: *
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s
 
-; CHECK-LABEL: @kernel_arg_i64
+; CHECK-LABEL: {{^}}kernel_arg_i64:
 define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
   store i64 %a, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; i64 arg works, v1i64 arg does not.
-; CHECK-LABEL: @kernel_arg_v1i64
+; CHECK-LABEL: {{^}}kernel_arg_v1i64:
 define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
   ret void
diff --git a/test/CodeGen/R600/v_cndmask.ll b/test/CodeGen/R600/v_cndmask.ll
index 84087ee..a24dcc7 100644
--- a/test/CodeGen/R600/v_cndmask.ll
+++ b/test/CodeGen/R600/v_cndmask.ll
@@ -1,14 +1,38 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI: @v_cnd_nan
-; SI: V_CNDMASK_B32_e64 v{{[0-9]}},
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; SI-LABEL: {{^}}v_cnd_nan_nosgpr:
+; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}}
 ; SI-DAG: v{{[0-9]}}
 ; All nan values are converted to 0xffffffff
-; SI-DAG: -1
-define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) {
-entry:
-  %0 = icmp ne i32 %c, 0
-  %1 = select i1 %0, float 0xFFFFFFFFE0000000, float %f
-  store float %1, float addrspace(1)* %out
+; SI: s_endpgm
+define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
+  %idx = call i32 @llvm.r600.read.tidig.x() #1
+  %f.gep = getelementptr float addrspace(1)* %fptr, i32 %idx
+  %f = load float addrspace(1)* %fptr
+  %setcc = icmp ne i32 %c, 0
+  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
+  store float %select, float addrspace(1)* %out
   ret void
 }
+
+
+; This requires slightly trickier SGPR operand legalization since the
+; single constant bus SGPR usage is the last operand, and it should
+; never be moved.
+
+; SI-LABEL: {{^}}v_cnd_nan:
+; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}}
+; SI-DAG: v{{[0-9]}}
+; All nan values are converted to 0xffffffff
+; SI: s_endpgm
+define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
+  %setcc = icmp ne i32 %c, 0
+  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
+  store float %select, float addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/valu-i1.ll b/test/CodeGen/R600/valu-i1.ll
index 5d5e3ff..2c209fc 100644
--- a/test/CodeGen/R600/valu-i1.ll
+++ b/test/CodeGen/R600/valu-i1.ll
@@ -2,8 +2,8 @@
 
 ; Make sure the i1 values created by the cfg structurizer pass are
 ; moved using VALU instructions
-; SI-NOT: S_MOV_B64 s[{{[0-9]:[0-9]}}], -1
-; SI: V_MOV_B32_e32 v{{[0-9]}}, -1
+; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
+; SI: v_mov_b32_e32 v{{[0-9]}}, -1
 define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) {
 entry:
   switch i32 %a, label %default [
diff --git a/test/CodeGen/R600/vector-alloca.ll b/test/CodeGen/R600/vector-alloca.ll
index 6543f6d..0b457a8 100644
--- a/test/CodeGen/R600/vector-alloca.ll
+++ b/test/CodeGen/R600/vector-alloca.ll
@@ -1,7 +1,8 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @vector_read
+; FUNC-LABEL: {{^}}vector_read:
 ; EG: MOV
 ; EG: MOV
 ; EG: MOV
@@ -24,7 +25,7 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: @vector_write
+; FUNC-LABEL: {{^}}vector_write:
 ; EG: MOV
 ; EG: MOV
 ; EG: MOV
@@ -52,8 +53,8 @@ entry:
 
 ; This test should be optimize to:
 ; store i32 0, i32 addrspace(1)* %out
-; FUNC-LABEL: @bitcast_gep
-; CHECK: STORE_RAW
+; FUNC-LABEL: {{^}}bitcast_gep:
+; EG: STORE_RAW
 define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
 entry:
   %0 = alloca [4 x i32]
diff --git a/test/CodeGen/R600/vertex-fetch-encoding.ll b/test/CodeGen/R600/vertex-fetch-encoding.ll
index 7ea7a5c..e24744e 100644
--- a/test/CodeGen/R600/vertex-fetch-encoding.ll
+++ b/test/CodeGen/R600/vertex-fetch-encoding.ll
@@ -1,9 +1,9 @@
 ; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=barts | FileCheck --check-prefix=NI-CHECK %s
 ; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=cayman | FileCheck --check-prefix=CM-CHECK %s
 
-; NI-CHECK: @vtx_fetch32
+; NI-CHECK: {{^}}vtx_fetch32:
 ; NI-CHECK: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
-; CM-CHECK: @vtx_fetch32
+; CM-CHECK: {{^}}vtx_fetch32:
 ; CM-CHECK: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
 
 define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -13,7 +13,7 @@ entry:
   ret void
 }
 
-; NI-CHECK: @vtx_fetch128
+; NI-CHECK: {{^}}vtx_fetch128:
 ; NI-CHECK: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00
 ; XXX: Add a case for Cayman when v4i32 stores are supported.
 
diff --git a/test/CodeGen/R600/vop-shrink.ll b/test/CodeGen/R600/vop-shrink.ll
new file mode 100644
index 0000000..e7f0288
--- /dev/null
+++ b/test/CodeGen/R600/vop-shrink.ll
@@ -0,0 +1,50 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; Test that we correctly commute a sub instruction
+; FUNC-LABEL: {{^}}sub_rev:
+; SI-NOT: v_sub_i32_e32 v{{[0-9]+}}, s
+; SI: v_subrev_i32_e32 v{{[0-9]+}}, s
+
+; ModuleID = 'vop-shrink.ll'
+
+define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) {
+entry:
+  %vgpr = call i32 @llvm.r600.read.tidig.x() #1
+  %tmp = icmp eq i32 %cond, 0
+  br i1 %tmp, label %if, label %else
+
+if:                                               ; preds = %entry
+  %tmp1 = getelementptr i32 addrspace(1)* %out, i32 1
+  %tmp2 = extractelement <4 x i32> %sgpr, i32 1
+  store i32 %tmp2, i32 addrspace(1)* %out
+  br label %endif
+
+else:                                             ; preds = %entry
+  %tmp3 = extractelement <4 x i32> %sgpr, i32 2
+  %tmp4 = sub i32 %vgpr, %tmp3
+  store i32 %tmp4, i32 addrspace(1)* %out
+  br label %endif
+
+endif:                                            ; preds = %else, %if
+  ret void
+}
+
+; Test that we fold an immediate that was illegal for a 64-bit op into the
+; 32-bit op when we shrink it.
+
+; FUNC-LABEL: {{^}}add_fold:
+; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000
+define void @add_fold(float addrspace(1)* %out) {
+entry:
+  %tmp = call i32 @llvm.r600.read.tidig.x()
+  %tmp1 = uitofp i32 %tmp to float
+  %tmp2 = fadd float %tmp1, 1.024000e+03
+  store float %tmp2, float addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { readnone }
diff --git a/test/CodeGen/R600/vselect.ll b/test/CodeGen/R600/vselect.ll
index dca7b06..e84b8f7 100644
--- a/test/CodeGen/R600/vselect.ll
+++ b/test/CodeGen/R600/vselect.ll
@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
 ;RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-;EG-CHECK: @test_select_v2i32
+;EG-CHECK: {{^}}test_select_v2i32:
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @test_select_v2i32
-;SI-CHECK: V_CNDMASK_B32_e64
-;SI-CHECK: V_CNDMASK_B32_e64
+;SI-CHECK: {{^}}test_select_v2i32:
+;SI-CHECK: v_cndmask_b32_e64
+;SI-CHECK: v_cndmask_b32_e64
 
 define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
 entry:
@@ -19,13 +19,13 @@ entry:
   ret void
 }
 
-;EG-CHECK: @test_select_v2f32
+;EG-CHECK: {{^}}test_select_v2f32:
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @test_select_v2f32
-;SI-CHECK: V_CNDMASK_B32_e64
-;SI-CHECK: V_CNDMASK_B32_e64
+;SI-CHECK: {{^}}test_select_v2f32:
+;SI-CHECK: v_cndmask_b32_e64
+;SI-CHECK: v_cndmask_b32_e64
 
 define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
 entry:
@@ -37,17 +37,17 @@ entry:
   ret void
 }
 
-;EG-CHECK: @test_select_v4i32
+;EG-CHECK: {{^}}test_select_v4i32:
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @test_select_v4i32
-;SI-CHECK: V_CNDMASK_B32_e64
-;SI-CHECK: V_CNDMASK_B32_e64
-;SI-CHECK: V_CNDMASK_B32_e64
-;SI-CHECK: V_CNDMASK_B32_e64
+;SI-CHECK: {{^}}test_select_v4i32:
+;SI-CHECK: v_cndmask_b32_e64
+;SI-CHECK: v_cndmask_b32_e64
+;SI-CHECK: v_cndmask_b32_e64
+;SI-CHECK: v_cndmask_b32_e64
 
 define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
 entry:
@@ -59,7 +59,7 @@ entry:
   ret void
 }
 
-;EG-CHECK: @test_select_v4f32
+;EG-CHECK: {{^}}test_select_v4f32:
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/vselect64.ll b/test/CodeGen/R600/vselect64.ll
index 604695b..ef85ebe 100644
--- a/test/CodeGen/R600/vselect64.ll
+++ b/test/CodeGen/R600/vselect64.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck  %s
 ; XXX: Merge this test into vselect.ll once SI supports 64-bit select.
 
-; CHECK-LABEL: @test_select_v4i64
+; CHECK-LABEL: {{^}}test_select_v4i64:
 ; Make sure the vectors aren't being stored on the stack.  We know they are
 ; being stored on the stack if the shaders uses at leat 10 registers.
 ; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X
diff --git a/test/CodeGen/R600/vtx-fetch-branch.ll b/test/CodeGen/R600/vtx-fetch-branch.ll
index 0fc99de..bcbe34e 100644
--- a/test/CodeGen/R600/vtx-fetch-branch.ll
+++ b/test/CodeGen/R600/vtx-fetch-branch.ll
@@ -6,7 +6,7 @@
 ; after the fetch clause.
 
 
-; CHECK-LABEL: @test
+; CHECK-LABEL: {{^}}test:
 ; CHECK-NOT: ALU_POP_AFTER
 ; CHECK: TEX
 ; CHECK-NEXT: POP
diff --git a/test/CodeGen/R600/vtx-schedule.ll b/test/CodeGen/R600/vtx-schedule.ll
index ce852c5..8254c99 100644
--- a/test/CodeGen/R600/vtx-schedule.ll
+++ b/test/CodeGen/R600/vtx-schedule.ll
@@ -4,7 +4,7 @@
 ; the result of another VTX_READ instruction were being grouped in the
 ; same fetch clasue.
 
-; CHECK: @test
+; CHECK: {{^}}test:
 ; CHECK: Fetch clause
 ; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0
 ; CHECK: Fetch clause
diff --git a/test/CodeGen/R600/wait.ll b/test/CodeGen/R600/wait.ll
index 2cf88fe..735eabd 100644
--- a/test/CodeGen/R600/wait.ll
+++ b/test/CodeGen/R600/wait.ll
@@ -1,37 +1,45 @@
-; RUN: llc < %s -march=r600 -mcpu=SI --verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s
 
-;CHECK-LABEL: @main
-;CHECK: S_WAITCNT lgkmcnt(0)
-;CHECK: S_WAITCNT vmcnt(0)
-;CHECK: S_WAITCNT expcnt(0) lgkmcnt(0)
-
-define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, i32 inreg, i32, i32, i32, i32) #0 {
+; CHECK-LABEL: {{^}}main:
+; CHECK: s_load_dwordx4
+; CHECK: s_load_dwordx4
+; CHECK: s_waitcnt lgkmcnt(0){{$}}
+; CHECK: s_waitcnt vmcnt(0){{$}}
+; CHECK: s_waitcnt expcnt(0) lgkmcnt(0){{$}}
+define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
 main_body:
-  %10 = getelementptr <16 x i8> addrspace(2)* %3, i32 0
-  %11 = load <16 x i8> addrspace(2)* %10, !tbaa !0
-  %12 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %11, i32 0, i32 %6)
-  %13 = extractelement <4 x float> %12, i32 0
-  %14 = extractelement <4 x float> %12, i32 1
-  %15 = extractelement <4 x float> %12, i32 2
-  %16 = extractelement <4 x float> %12, i32 3
-  %17 = getelementptr <16 x i8> addrspace(2)* %3, i32 1
-  %18 = load <16 x i8> addrspace(2)* %17, !tbaa !0
-  %19 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %18, i32 0, i32 %6)
-  %20 = extractelement <4 x float> %19, i32 0
-  %21 = extractelement <4 x float> %19, i32 1
-  %22 = extractelement <4 x float> %19, i32 2
-  %23 = extractelement <4 x float> %19, i32 3
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %20, float %21, float %22, float %23)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %13, float %14, float %15, float %16)
+  %tmp = getelementptr <16 x i8> addrspace(2)* %arg3, i32 0
+  %tmp10 = load <16 x i8> addrspace(2)* %tmp, !tbaa !0
+  %tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6)
+  %tmp12 = extractelement <4 x float> %tmp11, i32 0
+  %tmp13 = extractelement <4 x float> %tmp11, i32 1
+  call void @llvm.AMDGPU.barrier.global() #1
+  %tmp14 = extractelement <4 x float> %tmp11, i32 2
+;  %tmp15 = extractelement <4 x float> %tmp11, i32 3
+  %tmp15 = load float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt
+  %tmp16 = getelementptr <16 x i8> addrspace(2)* %arg3, i32 1
+  %tmp17 = load <16 x i8> addrspace(2)* %tmp16, !tbaa !0
+  %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6)
+  %tmp19 = extractelement <4 x float> %tmp18, i32 0
+  %tmp20 = extractelement <4 x float> %tmp18, i32 1
+  %tmp21 = extractelement <4 x float> %tmp18, i32 2
+  %tmp22 = extractelement <4 x float> %tmp18, i32 3
+  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp12, float %tmp13, float %tmp14, float %tmp15)
   ret void
 }
 
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.global() #1
+
 ; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
+declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2
 
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
 
 attributes #0 = { "ShaderType"="1" }
-attributes #1 = { nounwind readnone }
+attributes #1 = { noduplicate nounwind }
+attributes #2 = { nounwind readnone }
 
-!0 = metadata !{metadata !"const", null, i32 1}
+!0 = metadata !{metadata !1, metadata !1, i64 0, i32 1}
+!1 = metadata !{metadata !"const", null}
diff --git a/test/CodeGen/R600/work-item-intrinsics.ll b/test/CodeGen/R600/work-item-intrinsics.ll
index 90079b0..47f65f5 100644
--- a/test/CodeGen/R600/work-item-intrinsics.ll
+++ b/test/CodeGen/R600/work-item-intrinsics.ll
@@ -1,13 +1,14 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
-
-; R600-CHECK: @ngroups_x
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[0].X
-; SI-CHECK: @ngroups_x
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}ngroups_x:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[0].X
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @ngroups_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.x() #0
@@ -15,13 +16,13 @@ entry:
   ret void
 }
 
-; R600-CHECK: @ngroups_y
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[0].Y
-; SI-CHECK: @ngroups_y
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x1
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}ngroups_y:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[0].Y
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @ngroups_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.y() #0
@@ -29,13 +30,13 @@ entry:
   ret void
 }
 
-; R600-CHECK: @ngroups_z
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[0].Z
-; SI-CHECK: @ngroups_z
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x2
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}ngroups_z:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[0].Z
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @ngroups_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.z() #0
@@ -43,13 +44,13 @@ entry:
   ret void
 }
 
-; R600-CHECK: @global_size_x
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[0].W
-; SI-CHECK: @global_size_x
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x3
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}global_size_x:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[0].W
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @global_size_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.x() #0
@@ -57,13 +58,13 @@ entry:
   ret void
 }
 
-; R600-CHECK: @global_size_y
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[1].X
-; SI-CHECK: @global_size_y
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x4
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}global_size_y:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].X
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @global_size_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.y() #0
@@ -71,13 +72,13 @@ entry:
   ret void
 }
 
-; R600-CHECK: @global_size_z
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[1].Y
-; SI-CHECK: @global_size_z
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x5
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}global_size_z:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].Y
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @global_size_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.z() #0
@@ -85,13 +86,13 @@ entry:
   ret void
 }
 
-; R600-CHECK: @local_size_x
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[1].Z
-; SI-CHECK: @local_size_x
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x6
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}local_size_x:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].Z
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @local_size_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.x() #0
@@ -99,13 +100,13 @@ entry:
   ret void
 }
 
-; R600-CHECK: @local_size_y
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[1].W
-; SI-CHECK: @local_size_y
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x7
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}local_size_y:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].W
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @local_size_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.y() #0
@@ -113,13 +114,13 @@ entry:
   ret void
 }
 
-; R600-CHECK: @local_size_z
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[2].X
-; SI-CHECK: @local_size_z
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x8
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}local_size_z:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[2].X
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @local_size_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.z() #0
@@ -127,13 +128,27 @@ entry:
   ret void
 }
 
-; The tgid values are stored in ss offset by the number of user ss.
-; Currently we always use exactly 2 user ss for the pointer to the
+; FUNC-LABEL: {{^}}get_work_dim:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[2].Z
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
+define void @get_work_dim (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.AMDGPU.read.workdim() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; The tgid values are stored in sgprs offset by the number of user sgprs.
+; Currently we always use exactly 2 user sgprs for the pointer to the
 ; kernel arguments, but this may change in the future.
 
-; SI-CHECK: @tgid_x
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s2
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}tgid_x:
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4
+; SI: buffer_store_dword [[VVAL]]
 define void @tgid_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.x() #0
@@ -141,9 +156,9 @@ entry:
   ret void
 }
 
-; SI-CHECK: @tgid_y
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s3
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}tgid_y:
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5
+; SI: buffer_store_dword [[VVAL]]
 define void @tgid_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.y() #0
@@ -151,9 +166,9 @@ entry:
   ret void
 }
 
-; SI-CHECK: @tgid_z
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s4
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}tgid_z:
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6
+; SI: buffer_store_dword [[VVAL]]
 define void @tgid_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.z() #0
@@ -161,8 +176,8 @@ entry:
   ret void
 }
 
-; SI-CHECK: @tidig_x
-; SI-CHECK: BUFFER_STORE_DWORD v0
+; FUNC-LABEL: {{^}}tidig_x:
+; SI: buffer_store_dword v0
 define void @tidig_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() #0
@@ -170,8 +185,8 @@ entry:
   ret void
 }
 
-; SI-CHECK: @tidig_y
-; SI-CHECK: BUFFER_STORE_DWORD v1
+; FUNC-LABEL: {{^}}tidig_y:
+; SI: buffer_store_dword v1
 define void @tidig_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.y() #0
@@ -179,8 +194,8 @@ entry:
   ret void
 }
 
-; SI-CHECK: @tidig_z
-; SI-CHECK: BUFFER_STORE_DWORD v2
+; FUNC-LABEL: {{^}}tidig_z:
+; SI: buffer_store_dword v2
 define void @tidig_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.z() #0
@@ -208,4 +223,6 @@ declare i32 @llvm.r600.read.tidig.x() #0
 declare i32 @llvm.r600.read.tidig.y() #0
 declare i32 @llvm.r600.read.tidig.z() #0
 
+declare i32 @llvm.AMDGPU.read.workdim() #0
+
 attributes #0 = { readnone }
diff --git a/test/CodeGen/R600/wrong-transalu-pos-fix.ll b/test/CodeGen/R600/wrong-transalu-pos-fix.ll
index b1cbe3f..d652d2d 100644
--- a/test/CodeGen/R600/wrong-transalu-pos-fix.ll
+++ b/test/CodeGen/R600/wrong-transalu-pos-fix.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ; We want all MULLO_INT inst to be last in their instruction group
-;CHECK: @fill3d
+;CHECK: {{^}}fill3d:
 ;CHECK-NOT: MULLO_INT T[0-9]+
 
 ; ModuleID = 'radeon'
diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll
index ab618cf..fa54e38 100644
--- a/test/CodeGen/R600/xor.ll
+++ b/test/CodeGen/R600/xor.ll
@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-;EG-CHECK: @xor_v2i32
+;EG-CHECK: {{^}}xor_v2i32:
 ;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @xor_v2i32
-;SI-CHECK: V_XOR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_XOR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: {{^}}xor_v2i32:
+;SI-CHECK: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 
 define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
@@ -18,17 +18,17 @@ define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in
   ret void
 }
 
-;EG-CHECK: @xor_v4i32
+;EG-CHECK: {{^}}xor_v4i32:
 ;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @xor_v4i32
-;SI-CHECK: V_XOR_B32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_XOR_B32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_XOR_B32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_XOR_B32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: {{^}}xor_v4i32:
+;SI-CHECK: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
   %a = load <4 x i32> addrspace(1) * %in0
@@ -38,11 +38,11 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
   ret void
 }
 
-;EG-CHECK: @xor_i1
+;EG-CHECK: {{^}}xor_i1:
 ;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
 
-;SI-CHECK: @xor_i1
-;SI-CHECK: S_XOR_B64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
+;SI-CHECK: {{^}}xor_i1:
+;SI-CHECK: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
   %a = load float addrspace(1) * %in0
@@ -55,8 +55,8 @@ define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float ad
   ret void
 }
 
-; SI-CHECK-LABEL: @vector_xor_i32
-; SI-CHECK: V_XOR_B32_e32
+; SI-CHECK-LABEL: {{^}}vector_xor_i32:
+; SI-CHECK: v_xor_b32_e32
 define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %a = load i32 addrspace(1)* %in0
   %b = load i32 addrspace(1)* %in1
@@ -65,24 +65,24 @@ define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32
   ret void
 }
 
-; SI-CHECK-LABEL: @scalar_xor_i32
-; SI-CHECK: S_XOR_B32
+; SI-CHECK-LABEL: {{^}}scalar_xor_i32:
+; SI-CHECK: s_xor_b32
 define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %result = xor i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: @scalar_not_i32
-; SI-CHECK: S_NOT_B32
+; SI-CHECK-LABEL: {{^}}scalar_not_i32:
+; SI-CHECK: s_not_b32
 define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
   %result = xor i32 %a, -1
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: @vector_not_i32
-; SI-CHECK: V_NOT_B32
+; SI-CHECK-LABEL: {{^}}vector_not_i32:
+; SI-CHECK: v_not_b32
 define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %a = load i32 addrspace(1)* %in0
   %b = load i32 addrspace(1)* %in1
@@ -91,10 +91,10 @@ define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32
   ret void
 }
 
-; SI-CHECK-LABEL: @vector_xor_i64
-; SI-CHECK: V_XOR_B32_e32
-; SI-CHECK: V_XOR_B32_e32
-; SI-CHECK: S_ENDPGM
+; SI-CHECK-LABEL: {{^}}vector_xor_i64:
+; SI-CHECK: v_xor_b32_e32
+; SI-CHECK: v_xor_b32_e32
+; SI-CHECK: s_endpgm
 define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
   %a = load i64 addrspace(1)* %in0
   %b = load i64 addrspace(1)* %in1
@@ -103,26 +103,26 @@ define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64
   ret void
 }
 
-; SI-CHECK-LABEL: @scalar_xor_i64
-; SI-CHECK: S_XOR_B64
-; SI-CHECK: S_ENDPGM
+; SI-CHECK-LABEL: {{^}}scalar_xor_i64:
+; SI-CHECK: s_xor_b64
+; SI-CHECK: s_endpgm
 define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %result = xor i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: @scalar_not_i64
-; SI-CHECK: S_NOT_B64
+; SI-CHECK-LABEL: {{^}}scalar_not_i64:
+; SI-CHECK: s_not_b64
 define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
   %result = xor i64 %a, -1
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: @vector_not_i64
-; SI-CHECK: V_NOT_B32
-; SI-CHECK: V_NOT_B32
+; SI-CHECK-LABEL: {{^}}vector_not_i64:
+; SI-CHECK: v_not_b32
+; SI-CHECK: v_not_b32
 define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
   %a = load i64 addrspace(1)* %in0
   %b = load i64 addrspace(1)* %in1
@@ -135,9 +135,8 @@ define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64
 ; Note that in the future the backend may be smart enough to
 ; use an SALU instruction for this.
 
-; SI-CHECK-LABEL: @xor_cf
-; SI-CHECK: V_XOR
-; SI-CHECK: V_XOR
+; SI-CHECK-LABEL: {{^}}xor_cf:
+; SI-CHECK: s_xor_b64
 define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) {
 entry:
   %0 = icmp eq i64 %a, 0
diff --git a/test/CodeGen/R600/zero_extend.ll b/test/CodeGen/R600/zero_extend.ll
index 8585d4a..0fe1f15 100644
--- a/test/CodeGen/R600/zero_extend.ll
+++ b/test/CodeGen/R600/zero_extend.ll
@@ -1,14 +1,14 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; R600-CHECK: @test
+; R600-CHECK: {{^}}test:
 ; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW
 ; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW
 
-; SI-CHECK: @test
-; SI-CHECK: S_MOV_B32 [[ZERO:s[0-9]]], 0
-; SI-CHECK: V_MOV_B32_e32 v[[V_ZERO:[0-9]]], [[ZERO]]
-; SI-CHECK: BUFFER_STORE_DWORDX2 v[0:[[V_ZERO]]{{\]}}
+; SI-CHECK: {{^}}test:
+; SI-CHECK: s_mov_b32 [[ZERO:s[0-9]]], 0{{$}}
+; SI-CHECK: v_mov_b32_e32 v[[V_ZERO:[0-9]]], [[ZERO]]
+; SI-CHECK: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
 define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = mul i32 %a, %b
@@ -18,8 +18,8 @@ entry:
   ret void
 }
 
-; SI-CHECK-LABEL: @testi1toi32
-; SI-CHECK: V_CNDMASK_B32
+; SI-CHECK-LABEL: {{^}}testi1toi32:
+; SI-CHECK: v_cndmask_b32
 define void @testi1toi32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp eq i32 %a, %b
@@ -28,10 +28,10 @@ entry:
   ret void
 }
 
-; SI-CHECK-LABEL: @zext_i1_to_i64
-; SI-CHECK: V_CMP_EQ_I32
-; SI-CHECK: V_CNDMASK_B32
-; SI-CHECK: S_MOV_B32 s{{[0-9]+}}, 0
+; SI-CHECK-LABEL: {{^}}zext_i1_to_i64:
+; SI-CHECK: v_cmp_eq_i32
+; SI-CHECK: v_cndmask_b32
+; SI-CHECK: s_mov_b32 s{{[0-9]+}}, 0
 define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
   %ext = zext i1 %cmp to i64
diff --git a/test/CodeGen/SPARC/empty-functions.ll b/test/CodeGen/SPARC/empty-functions.ll
new file mode 100644
index 0000000..38d2889
--- /dev/null
+++ b/test/CodeGen/SPARC/empty-functions.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -mtriple=sparc-linux-gnu | FileCheck -check-prefix=LINUX-NO-FP %s
+; RUN: llc < %s -mtriple=sparc-linux-gnu -disable-fp-elim | FileCheck -check-prefix=LINUX-FP %s
+
+define void @func() {
+entry:
+  unreachable
+}
+
+; An empty function is perfectly fine on ELF.
+; LINUX-NO-FP: func:
+; LINUX-NO-FP-NEXT: .cfi_startproc
+; LINUX-NO-FP-NEXT: {{^}}!
+; LINUX-NO-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-NO-FP-NEXT: .size   func, .L{{.*}}-func
+; LINUX-NO-FP-NEXT: .cfi_endproc
+
+; A cfi directive can point to the end of a function. It (and in fact the
+; entire body) could be optimized out because of the unreachable, but we
+; don't do it right now.
+; LINUX-FP: func:
+; LINUX-FP-NEXT: .cfi_startproc
+; LINUX-FP-NEXT: {{^}}!
+; LINUX-FP-NEXT: save %sp, -96, %sp
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_def_cfa_register %fp
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_window_save
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_register 15, 31
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .size   func, .Ltmp3-func
+; LINUX-FP-NEXT: .cfi_endproc
diff --git a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
index ffc9584..d31a84b 100644
--- a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
+++ b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
@@ -25,7 +25,7 @@ define void @_Z19getClosestDiagonal3ii(%0* noalias sret, i32, i32) nounwind {
   %storemerge = phi double [ -1.000000e+00, %4 ], [ 1.000000e+00, %3 ], [ 1.000000e+00, %3 ] ; <double> [#uses=1]
   %v_6 = icmp slt i32 %1, 2                         ; <i1> [#uses=1]
   %storemerge1 = select i1 %v_6, double 1.000000e+00, double -1.000000e+00 ; <double> [#uses=3]
-  call void @llvm.dbg.value(metadata !{double %storemerge}, i64 0, metadata !91), !dbg !0
+  call void @llvm.dbg.value(metadata !{double %storemerge}, i64 0, metadata !91, metadata !{metadata !"0x102"}), !dbg !0
   %v_7 = icmp eq i32 %2, 1, !dbg !92                ; <i1> [#uses=1]
   %storemerge2 = select i1 %v_7, double 1.000000e+00, double -1.000000e+00 ; <double> [#uses=3]
   %v_8 = getelementptr inbounds %0* %0, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
@@ -40,116 +40,116 @@ define void @_Z19getClosestDiagonal3ii(%0* noalias sret, i32, i32) nounwind {
   ret void, !dbg !98
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare double @sqrt(double) nounwind readonly
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!5}
 !llvm.module.flags = !{!104}
 !0 = metadata !{i32 46, i32 0, metadata !1, null}
-!1 = metadata !{i32 524299, metadata !101, metadata !2, i32 44, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!2 = metadata !{i32 524299, metadata !101, metadata !3, i32 44, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!3 = metadata !{i32 524334, metadata !101, null, metadata !"getClosestDiagonal3", metadata !"getClosestDiagonal3", metadata !"_Z19getClosestDiagonal3ii", i32 44, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!4 = metadata !{i32 524329, metadata !101} ; [ DW_TAG_file_type ]
-!5 = metadata !{i32 524305, metadata !101, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)", i1 true, metadata !"", i32 0, metadata !102, metadata !102, metadata !103, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!6 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!1 = metadata !{metadata !"0xb\0044\000\000", metadata !101, metadata !2} ; [ DW_TAG_lexical_block ]
+!2 = metadata !{metadata !"0xb\0044\000\000", metadata !101, metadata !3} ; [ DW_TAG_lexical_block ]
+!3 = metadata !{metadata !"0x2e\00getClosestDiagonal3\00getClosestDiagonal3\00_Z19getClosestDiagonal3ii\0044\000\001\000\006\000\000\000", metadata !101, null, metadata !6, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!4 = metadata !{metadata !"0x29", metadata !101} ; [ DW_TAG_file_type ]
+!5 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)\001\00\000\00\000", metadata !101, metadata !102, metadata !102, metadata !103, null, null} ; [ DW_TAG_compile_unit ]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !22, metadata !22}
-!8 = metadata !{i32 524307, metadata !99, null, metadata !"ggVector3", i32 66, i64 192, i64 32, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [ggVector3] [line 66, size 192, align 32, offset 0] [def] [from ]
-!9 = metadata !{i32 524329, metadata !"ggVector3.h", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src", metadata !5} ; [ DW_TAG_file_type ]
+!8 = metadata !{metadata !"0x13\00ggVector3\0066\00192\0032\000\000\000", metadata !99, null, null, metadata !10, null, null, null} ; [ DW_TAG_structure_type ] [ggVector3] [line 66, size 192, align 32, offset 0] [def] [from ]
+!9 = metadata !{metadata !"0x29", metadata !"ggVector3.h", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src", metadata !5} ; [ DW_TAG_file_type ]
 !99 = metadata !{metadata !"ggVector3.h", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src"}
 !10 = metadata !{metadata !11, metadata !16, metadata !23, metadata !26, metadata !29, metadata !30, metadata !35, metadata !36, metadata !37, metadata !41, metadata !42, metadata !43, metadata !46, metadata !47, metadata !48, metadata !52, metadata !53, metadata !54, metadata !57, metadata !60, metadata !63, metadata !66, metadata !70, metadata !71, metadata !74, metadata !75, metadata !76, metadata !77, metadata !78, metadata !81, metadata !82, metadata !83, metadata !84, metadata !85, metadata !88, metadata !89, metadata !90}
-!11 = metadata !{i32 524301, metadata !99, metadata !8, metadata !"e", i32 160, i64 192, i64 32, i64 0, i32 0, metadata !12} ; [ DW_TAG_member ]
-!12 = metadata !{i32 524289, metadata !101, metadata !4, metadata !"", i32 0, i64 192, i64 32, i64 0, i32 0, metadata !13, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 192, align 32, offset 0] [from double]
-!13 = metadata !{i32 524324, metadata !101, metadata !4, metadata !"double", i32 0, i64 64, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
+!11 = metadata !{metadata !"0xd\00e\00160\00192\0032\000\000", metadata !99, metadata !8, metadata !12} ; [ DW_TAG_member ]
+!12 = metadata !{metadata !"0x1\00\000\00192\0032\000\000", metadata !101, metadata !4, metadata !13, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 192, align 32, offset 0] [from double]
+!13 = metadata !{metadata !"0x24\00double\000\0064\0032\000\000\004", metadata !101, metadata !4} ; [ DW_TAG_base_type ]
 !14 = metadata !{metadata !15}
-!15 = metadata !{i32 524321, i64 0, i64 3}        ; [ DW_TAG_subrange_type ]
-!16 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", i32 72, metadata !17, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!17 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{metadata !"0x21\000\003"}        ; [ DW_TAG_subrange_type ]
+!16 = metadata !{metadata !"0x2e\00ggVector3\00ggVector3\00\0072\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !17, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !18 = metadata !{null, metadata !19, metadata !20}
-!19 = metadata !{i32 524303, metadata !101, metadata !4, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 64, metadata !8} ; [ DW_TAG_pointer_type ]
-!20 = metadata !{i32 524310, metadata !100, null, metadata !"ggBoolean", i32 478, i64 0, i64 0, i64 0, i32 0, metadata !22} ; [ DW_TAG_typedef ]
-!21 = metadata !{i32 524329, metadata !"math.h", metadata !"/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.Internal.sdk/usr/include/architecture/arm", metadata !5} ; [ DW_TAG_file_type ]
+!19 = metadata !{metadata !"0xf\00\000\0032\0032\000\0064", metadata !101, metadata !4, metadata !8} ; [ DW_TAG_pointer_type ]
+!20 = metadata !{metadata !"0x16\00ggBoolean\00478\000\000\000\000", metadata !100, null, metadata !22} ; [ DW_TAG_typedef ]
+!21 = metadata !{metadata !"0x29", metadata !"math.h", metadata !"/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.Internal.sdk/usr/include/architecture/arm", metadata !5} ; [ DW_TAG_file_type ]
 !100 = metadata !{metadata !"math.h", metadata !"/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.Internal.sdk/usr/include/architecture/arm"}
-!22 = metadata !{i32 524324, metadata !101, metadata !4, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!23 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", i32 73, metadata !24, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!24 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !101, metadata !4} ; [ DW_TAG_base_type ]
+!23 = metadata !{metadata !"0x2e\00ggVector3\00ggVector3\00\0073\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !24, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!24 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !25 = metadata !{null, metadata !19}
-!26 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", i32 74, metadata !27, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!27 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !28, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!26 = metadata !{metadata !"0x2e\00ggVector3\00ggVector3\00\0074\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !27, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!27 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !28, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !28 = metadata !{null, metadata !19, metadata !13, metadata !13, metadata !13}
-!29 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"Set", metadata !"Set", metadata !"_ZN9ggVector33SetEddd", i32 81, metadata !27, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!30 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"x", metadata !"x", metadata !"_ZNK9ggVector31xEv", i32 82, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!31 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !32, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!29 = metadata !{metadata !"0x2e\00Set\00Set\00_ZN9ggVector33SetEddd\0081\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !27, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!30 = metadata !{metadata !"0x2e\00x\00x\00_ZNK9ggVector31xEv\0082\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!31 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !32, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !32 = metadata !{metadata !13, metadata !33}
-!33 = metadata !{i32 524303, metadata !101, metadata !4, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 64, metadata !34} ; [ DW_TAG_pointer_type ]
-!34 = metadata !{i32 524326, metadata !101, metadata !4, metadata !"", i32 0, i64 192, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ]
-!35 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"y", metadata !"y", metadata !"_ZNK9ggVector31yEv", i32 83, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!36 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"z", metadata !"z", metadata !"_ZNK9ggVector31zEv", i32 84, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!37 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"x", metadata !"x", metadata !"_ZN9ggVector31xEv", i32 85, metadata !38, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!38 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !39, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!33 = metadata !{metadata !"0xf\00\000\0032\0032\000\0064", metadata !101, metadata !4, metadata !34} ; [ DW_TAG_pointer_type ]
+!34 = metadata !{metadata !"0x26\00\000\00192\0032\000\000", metadata !101, metadata !4, metadata !8} ; [ DW_TAG_const_type ]
+!35 = metadata !{metadata !"0x2e\00y\00y\00_ZNK9ggVector31yEv\0083\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!36 = metadata !{metadata !"0x2e\00z\00z\00_ZNK9ggVector31zEv\0084\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!37 = metadata !{metadata !"0x2e\00x\00x\00_ZN9ggVector31xEv\0085\000\001\000\006\000\000\000", metadata !9, metadata !8, metadata !38, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!38 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !39, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !39 = metadata !{metadata !40, metadata !19}
-!40 = metadata !{i32 524304, metadata !101, metadata !4, metadata !"double", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !13} ; [ DW_TAG_reference_type ]
-!41 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"y", metadata !"y", metadata !"_ZN9ggVector31yEv", i32 86, metadata !38, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!42 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"z", metadata !"z", metadata !"_ZN9ggVector31zEv", i32 87, metadata !38, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!43 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"SetX", metadata !"SetX", metadata !"_ZN9ggVector34SetXEd", i32 88, metadata !44, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!44 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !45, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!40 = metadata !{metadata !"0x10\00double\000\0032\0032\000\000", metadata !101, metadata !4, metadata !13} ; [ DW_TAG_reference_type ]
+!41 = metadata !{metadata !"0x2e\00y\00y\00_ZN9ggVector31yEv\0086\000\001\000\006\000\000\000", metadata !9, metadata !8, metadata !38, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!42 = metadata !{metadata !"0x2e\00z\00z\00_ZN9ggVector31zEv\0087\000\001\000\006\000\000\000", metadata !9, metadata !8, metadata !38, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!43 = metadata !{metadata !"0x2e\00SetX\00SetX\00_ZN9ggVector34SetXEd\0088\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !44, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!44 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !45, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !45 = metadata !{null, metadata !19, metadata !13}
-!46 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"SetY", metadata !"SetY", metadata !"_ZN9ggVector34SetYEd", i32 89, metadata !44, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!47 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"SetZ", metadata !"SetZ", metadata !"_ZN9ggVector34SetZEd", i32 90, metadata !44, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!48 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", i32 92, metadata !49, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!49 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !50, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!46 = metadata !{metadata !"0x2e\00SetY\00SetY\00_ZN9ggVector34SetYEd\0089\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !44, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!47 = metadata !{metadata !"0x2e\00SetZ\00SetZ\00_ZN9ggVector34SetZEd\0090\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !44, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!48 = metadata !{metadata !"0x2e\00ggVector3\00ggVector3\00\0092\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !49, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!49 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !50, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !50 = metadata !{null, metadata !19, metadata !51}
-!51 = metadata !{i32 524304, metadata !101, metadata !4, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !34} ; [ DW_TAG_reference_type ]
-!52 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"tolerance", metadata !"tolerance", metadata !"_ZNK9ggVector39toleranceEv", i32 100, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!53 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"tolerance", metadata !"tolerance", metadata !"_ZN9ggVector39toleranceEv", i32 101, metadata !38, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!54 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator+", metadata !"operator+", metadata !"_ZNK9ggVector3psEv", i32 107, metadata !55, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!55 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !56, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!51 = metadata !{metadata !"0x10\00\000\0032\0032\000\000", metadata !101, metadata !4, metadata !34} ; [ DW_TAG_reference_type ]
+!52 = metadata !{metadata !"0x2e\00tolerance\00tolerance\00_ZNK9ggVector39toleranceEv\00100\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!53 = metadata !{metadata !"0x2e\00tolerance\00tolerance\00_ZN9ggVector39toleranceEv\00101\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !38, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!54 = metadata !{metadata !"0x2e\00operator+\00operator+\00_ZNK9ggVector3psEv\00107\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !55, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!55 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !56, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !56 = metadata !{metadata !51, metadata !33}
-!57 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator-", metadata !"operator-", metadata !"_ZNK9ggVector3ngEv", i32 108, metadata !58, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!58 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !59, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!57 = metadata !{metadata !"0x2e\00operator-\00operator-\00_ZNK9ggVector3ngEv\00108\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !58, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!58 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !59, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !59 = metadata !{metadata !8, metadata !33}
-!60 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator[]", metadata !"operator[]", metadata !"_ZNK9ggVector3ixEi", i32 290, metadata !61, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!61 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !62, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!60 = metadata !{metadata !"0x2e\00operator[]\00operator[]\00_ZNK9ggVector3ixEi\00290\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !61, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!61 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !62, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !62 = metadata !{metadata !13, metadata !33, metadata !22}
-!63 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator[]", metadata !"operator[]", metadata !"_ZN9ggVector3ixEi", i32 278, metadata !64, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!64 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !65, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!63 = metadata !{metadata !"0x2e\00operator[]\00operator[]\00_ZN9ggVector3ixEi\00278\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !64, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!64 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !65, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !65 = metadata !{metadata !40, metadata !19, metadata !22}
-!66 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator+=", metadata !"operator+=", metadata !"_ZN9ggVector3pLERKS_", i32 303, metadata !67, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!67 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !68, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!66 = metadata !{metadata !"0x2e\00operator+=\00operator+=\00_ZN9ggVector3pLERKS_\00303\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !67, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!67 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !68, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !68 = metadata !{metadata !69, metadata !19, metadata !51}
-!69 = metadata !{i32 524304, metadata !101, metadata !4, metadata !"ggVector3", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_reference_type ]
-!70 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator-=", metadata !"operator-=", metadata !"_ZN9ggVector3mIERKS_", i32 310, metadata !67, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!71 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator*=", metadata !"operator*=", metadata !"_ZN9ggVector3mLEd", i32 317, metadata !72, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!72 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !73, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!69 = metadata !{metadata !"0x10\00ggVector3\000\0032\0032\000\000", metadata !101, metadata !4, metadata !8} ; [ DW_TAG_reference_type ]
+!70 = metadata !{metadata !"0x2e\00operator-=\00operator-=\00_ZN9ggVector3mIERKS_\00310\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !67, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!71 = metadata !{metadata !"0x2e\00operator*=\00operator*=\00_ZN9ggVector3mLEd\00317\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !72, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!72 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !73, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !73 = metadata !{metadata !69, metadata !19, metadata !13}
-!74 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator/=", metadata !"operator/=", metadata !"_ZN9ggVector3dVEd", i32 324, metadata !72, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!75 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"length", metadata !"length", metadata !"_ZNK9ggVector36lengthEv", i32 121, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!76 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"squaredLength", metadata !"squaredLength", metadata !"_ZNK9ggVector313squaredLengthEv", i32 122, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!77 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"MakeUnitVector", metadata !"MakeUnitVector", metadata !"_ZN9ggVector314MakeUnitVectorEv", i32 217, metadata !24, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!78 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"Perturb", metadata !"Perturb", metadata !"_ZNK9ggVector37PerturbEdd", i32 126, metadata !79, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!79 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !80, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!74 = metadata !{metadata !"0x2e\00operator/=\00operator/=\00_ZN9ggVector3dVEd\00324\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !72, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!75 = metadata !{metadata !"0x2e\00length\00length\00_ZNK9ggVector36lengthEv\00121\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!76 = metadata !{metadata !"0x2e\00squaredLength\00squaredLength\00_ZNK9ggVector313squaredLengthEv\00122\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!77 = metadata !{metadata !"0x2e\00MakeUnitVector\00MakeUnitVector\00_ZN9ggVector314MakeUnitVectorEv\00217\000\001\000\006\000\000\000", metadata !9, metadata !8, metadata !24, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!78 = metadata !{metadata !"0x2e\00Perturb\00Perturb\00_ZNK9ggVector37PerturbEdd\00126\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !79, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!79 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !80, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !80 = metadata !{metadata !8, metadata !33, metadata !13, metadata !13}
-!81 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"maxComponent", metadata !"maxComponent", metadata !"_ZNK9ggVector312maxComponentEv", i32 128, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!82 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"minComponent", metadata !"minComponent", metadata !"_ZNK9ggVector312minComponentEv", i32 129, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!83 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"maxAbsComponent", metadata !"maxAbsComponent", metadata !"_ZNK9ggVector315maxAbsComponentEv", i32 131, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!84 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"minAbsComponent", metadata !"minAbsComponent", metadata !"_ZNK9ggVector315minAbsComponentEv", i32 132, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!85 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"indexOfMinComponent", metadata !"indexOfMinComponent", metadata !"_ZNK9ggVector319indexOfMinComponentEv", i32 133, metadata !86, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!86 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !87, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!81 = metadata !{metadata !"0x2e\00maxComponent\00maxComponent\00_ZNK9ggVector312maxComponentEv\00128\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!82 = metadata !{metadata !"0x2e\00minComponent\00minComponent\00_ZNK9ggVector312minComponentEv\00129\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!83 = metadata !{metadata !"0x2e\00maxAbsComponent\00maxAbsComponent\00_ZNK9ggVector315maxAbsComponentEv\00131\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!84 = metadata !{metadata !"0x2e\00minAbsComponent\00minAbsComponent\00_ZNK9ggVector315minAbsComponentEv\00132\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!85 = metadata !{metadata !"0x2e\00indexOfMinComponent\00indexOfMinComponent\00_ZNK9ggVector319indexOfMinComponentEv\00133\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !86, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!86 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !87, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !87 = metadata !{metadata !22, metadata !33}
-!88 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"indexOfMinAbsComponent", metadata !"indexOfMinAbsComponent", metadata !"_ZNK9ggVector322indexOfMinAbsComponentEv", i32 137, metadata !86, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!89 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"indexOfMaxComponent", metadata !"indexOfMaxComponent", metadata !"_ZNK9ggVector319indexOfMaxComponentEv", i32 146, metadata !86, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!90 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"indexOfMaxAbsComponent", metadata !"indexOfMaxAbsComponent", metadata !"_ZNK9ggVector322indexOfMaxAbsComponentEv", i32 150, metadata !86, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!91 = metadata !{i32 524544, metadata !1, metadata !"vx", metadata !4, i32 46, metadata !13} ; [ DW_TAG_auto_variable ]
+!88 = metadata !{metadata !"0x2e\00indexOfMinAbsComponent\00indexOfMinAbsComponent\00_ZNK9ggVector322indexOfMinAbsComponentEv\00137\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !86, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!89 = metadata !{metadata !"0x2e\00indexOfMaxComponent\00indexOfMaxComponent\00_ZNK9ggVector319indexOfMaxComponentEv\00146\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !86, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!90 = metadata !{metadata !"0x2e\00indexOfMaxAbsComponent\00indexOfMaxAbsComponent\00_ZNK9ggVector322indexOfMaxAbsComponentEv\00150\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !86, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!91 = metadata !{metadata !"0x100\00vx\0046\000", metadata !1, metadata !4, metadata !13} ; [ DW_TAG_auto_variable ]
 !92 = metadata !{i32 48, i32 0, metadata !1, null}
 !93 = metadata !{i32 218, i32 0, metadata !94, metadata !96}
-!94 = metadata !{i32 524299, metadata !101, metadata !95, i32 217, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!95 = metadata !{i32 524299, metadata !101, metadata !77, i32 217, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!94 = metadata !{metadata !"0xb\00217\000\000", metadata !101, metadata !95} ; [ DW_TAG_lexical_block ]
+!95 = metadata !{metadata !"0xb\00217\000\000", metadata !101, metadata !77} ; [ DW_TAG_lexical_block ]
 !96 = metadata !{i32 51, i32 0, metadata !1, null}
 !97 = metadata !{i32 227, i32 0, metadata !94, metadata !96}
 !98 = metadata !{i32 52, i32 0, metadata !1, null}
 !101 = metadata !{metadata !"ggEdgeDiscrepancy.cc", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src"}
 !102 = metadata !{i32 0}
 !103 = metadata !{metadata !3, metadata !77}
-!104 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!104 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/Thumb/2012-04-26-M0ISelBug.ll b/test/CodeGen/Thumb/2012-04-26-M0ISelBug.ll
index b39978b..369ac96 100644
--- a/test/CodeGen/Thumb/2012-04-26-M0ISelBug.ll
+++ b/test/CodeGen/Thumb/2012-04-26-M0ISelBug.ll
@@ -5,7 +5,7 @@
 define i32 @t(i32 %a) nounwind {
 ; CHECK-LABEL: t:
 ; CHECK: asrs [[REG1:(r[0-9]+)]], [[REG2:(r[0-9]+)]], #31
-; CHECK: eors [[REG1]], [[REG2]]
+; CHECK: eors [[REG2]], [[REG1]]
   %tmp0 = ashr i32 %a, 31
   %tmp1 = xor i32 %tmp0, %a
   ret i32 %tmp1
diff --git a/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll b/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll
index ae66369..cfa1159 100644
--- a/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll
+++ b/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll
@@ -1,12 +1,11 @@
-; RUN: llc < %s -mtriple=thumbv6m-eabi -o - | FileCheck %s
-; XFAIL: *
+; RUN: llc < %s -mtriple=thumbv6m-eabi -verify-machineinstrs -o - | FileCheck %s
 
 define void @foo(i32* %A) #0 {
 entry:
 ; CHECK-LABEL: foo:
 ; CHECK: push {r7, lr}
-; CHECK: ldm [[REG0:r[0-9]]]!,
-; CHECK-NEXT: subs [[REG0]]
+; CHECK: ldm
+; CHECK-NEXT: subs
 ; CHECK-NEXT: bl
   %0 = load i32* %A, align 4
   %arrayidx1 = getelementptr inbounds i32* %A, i32 1
diff --git a/test/CodeGen/Thumb/copy_thumb.ll b/test/CodeGen/Thumb/copy_thumb.ll
new file mode 100644
index 0000000..528f54b
--- /dev/null
+++ b/test/CodeGen/Thumb/copy_thumb.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple=armv4-none--eabi < %s | FileCheck %s --check-prefix=CHECK-LOLOMOV
+; RUN: llc -mtriple=armv4t-none--eabi < %s | FileCheck %s --check-prefix=CHECK-LOLOMOV
+; RUN: llc -mtriple=armv5-none--eabi < %s | FileCheck %s --check-prefix=CHECK-LOLOMOV
+; RUN: llc -mtriple=armv6-none--eabi < %s | FileCheck %s --check-prefix=CHECK-LOLOMOV
+; RUN: llc -mtriple=armv7-none--eabi < %s | FileCheck %s --check-prefix=CHECK-LOLOMOV
+; RUN: llc -mtriple=thumbv6-none--eabi < %s | FileCheck %s --check-prefix=CHECK-LOLOMOV
+; RUN: llc -mtriple=thumbv7-none--eabi < %s | FileCheck %s --check-prefix=CHECK-LOLOMOV
+; CHECK-LOLOMOV-LABEL:  foo
+; CHECK-LOLOMOV:        mov [[TMP:r[0-7]]], [[SRC1:r[01]]]
+; CHECK-LOLOMOV-NEXT:   mov [[SRC1]], [[SRC2:r[01]]]
+; CHECK-LOLOMOV-NEXT:   mov [[SRC2]], [[TMP]]
+; CHECK-LOLOMOV-LABEL:  bar
+; CHECK-LOLOMOV-LABEL:  fnend
+; 
+; 'MOV lo, lo' in Thumb mode produces undefined results on pre-v6 hardware
+; RUN: llc -mtriple=thumbv4t-none--eabi < %s | FileCheck %s --check-prefix=CHECK-NOLOLOMOV
+; RUN: llc -mtriple=thumbv5-none--eabi < %s | FileCheck %s --check-prefix=CHECK-NOLOLOMOV
+; CHECK-NOLOLOMOV-LABEL: foo
+; CHECK-NOLOLOMOV-NOT:   mov [[TMP:r[0-7]]], [[SRC1:r[01]]]
+; CHECK-NOLOLOMOV:       push  {[[SRC1:r[01]]]}
+; CHECK-NOLOLOMOV-NEXT:  pop {[[TMP:r[0-7]]]}
+; CHECK-NOLOLOMOV-NOT:   mov [[TMP:r[0-7]]], [[SRC1:r[01]]]
+; CHECK-NOLOLOMOV:       push  {[[SRC2:r[01]]]}
+; CHECK-NOLOLOMOV-NEXT:  pop {[[SRC1]]}
+; CHECK-NOLOLOMOV-NOT:   mov [[TMP:r[0-7]]], [[SRC1:r[01]]]
+; CHECK-NOLOLOMOV:       push  {[[TMP]]}
+; CHECK-NOLOLOMOV-NEXT:  pop {[[SRC2]]}
+; CHECK-NOLOLOMOV-LABEL: bar
+; CHECK-NOLOLOMOV-LABEL: fnend
+
+declare void @bar(i32, i32)
+
+define void @foo(i32 %a, i32 %b) {
+entry:
+  call void @bar(i32 %b, i32 %a);
+  ret void
+}
+
diff --git a/test/CodeGen/Thumb/dyn-stackalloc.ll b/test/CodeGen/Thumb/dyn-stackalloc.ll
index 6c6de55..309d802 100644
--- a/test/CodeGen/Thumb/dyn-stackalloc.ll
+++ b/test/CodeGen/Thumb/dyn-stackalloc.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra | FileCheck %s
-; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic | FileCheck %s
+; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=RA_GREEDY
+; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=RA_BASIC
 
 	%struct.state = type { i32, %struct.info*, float**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i8* }
 	%struct.info = type { i32, i32, i32, i32, i32, i32, i32, i8* }
@@ -45,7 +45,8 @@ define void @t2(%struct.comment* %vc, i8* %tag, i8* %contents) {
 ; CHECK: sub sp, #
 ; CHECK: mov r[[R0:[0-9]+]], sp
 ; CHECK: str r{{[0-9+]}}, [r[[R0]]
-; CHECK: str r{{[0-9+]}}, [r[[R0]]
+; RA_GREEDY: str r{{[0-9+]}}, [r[[R0]]
+; RA_BASIC: stm r[[R0]]!
 ; CHECK-NOT: ldr r0, [sp
 ; CHECK: mov r[[R1:[0-9]+]], sp
 ; CHECK: subs r[[R2:[0-9]+]], r[[R1]], r{{[0-9]+}}
diff --git a/test/CodeGen/Thumb/inlineasm-thumb.ll b/test/CodeGen/Thumb/inlineasm-thumb.ll
index 2547ce8..cfaf2ba 100644
--- a/test/CodeGen/Thumb/inlineasm-thumb.ll
+++ b/test/CodeGen/Thumb/inlineasm-thumb.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -no-integrated-as %s -o - | FileCheck %s
 
 define i32 @t1(i32 %x, i32 %y) nounwind {
 entry:
@@ -6,3 +6,14 @@ entry:
   %0 = tail call i32 asm "mov $0, $1", "=l,h"(i32 %y) nounwind
   ret i32 %0
 }
+
+; CHECK-LABEL: constraint_r:
+; CHECK: foo2 r{{[0-7]+}}, r{{[0-7]+}}
+
+define i32 @constraint_r() {
+entry:
+  %0 = tail call i32 asm sideeffect "movs $0, #1", "=r"()
+  tail call void asm sideeffect "foo1", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7}"()
+  %1 = tail call i32 asm sideeffect "foo2 $0, $1", "=r,r"(i32 %0)
+  ret i32 %1
+}
diff --git a/test/CodeGen/Thumb/large-stack.ll b/test/CodeGen/Thumb/large-stack.ll
index fb6daa4..269bdd9 100644
--- a/test/CodeGen/Thumb/large-stack.ll
+++ b/test/CodeGen/Thumb/large-stack.ll
@@ -1,31 +1,57 @@
-; RUN: llc < %s -mtriple=thumb-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=thumb-apple-ios | FileCheck %s --check-prefix=CHECK --check-prefix=IOS
+; RUN: llc < %s -mtriple=thumb-none-eabi | FileCheck %s --check-prefix=CHECK --check-prefix=EABI
+; RUN: llc < %s -o %t -filetype=obj -mtriple=thumbv6-apple-ios
+; RUN: llvm-objdump -triple=thumbv6-apple-ios -d %t | FileCheck %s --check-prefix=CHECK --check-prefix=IOS
+; RUN: llc < %s -o %t -filetype=obj -mtriple=thumbv6-none-eabi
+; RUN: llvm-objdump -triple=thumbv6-none-eabi -d %t | FileCheck %s --check-prefix=CHECK --check-prefix=EABI
 
+; Largest stack for which a single tADDspi/tSUBspi is enough
 define void @test1() {
 ; CHECK-LABEL: test1:
-; CHECK: sub sp, #256
-; CHECK: add sp, #256
-    %tmp = alloca [ 64 x i32 ] , align 4
+; CHECK: sub sp, #508
+; CHECK: add sp, #508
+    %tmp = alloca [ 508 x i8 ] , align 4
     ret void
 }
 
+; Largest stack for which three tADDspi/tSUBspis are enough
+define void @test100() {
+; CHECK-LABEL: test100:
+; CHECK: sub sp, #508
+; CHECK: sub sp, #508
+; CHECK: sub sp, #508
+; EABI: add sp, #508
+; EABI: add sp, #508
+; EABI: add sp, #508
+; IOS: subs r4, r7, #4
+; IOS: mov sp, r4
+    %tmp = alloca [ 1524 x i8 ] , align 4
+    ret void
+}
+
+; Smallest stack for which we use a constant pool
 define void @test2() {
 ; CHECK-LABEL: test2:
-; CHECK: ldr r0, LCPI
+; CHECK: ldr r0,
 ; CHECK: add sp, r0
-; CHECK: subs r4, r7, #4
-; CHECK: mov sp, r4
-    %tmp = alloca [ 4168 x i8 ] , align 4
+; EABI: ldr r0,
+; EABI: add sp, r0
+; IOS: subs r4, r7, #4
+; IOS: mov sp, r4
+    %tmp = alloca [ 1528 x i8 ] , align 4
     ret void
 }
 
 define i32 @test3() {
 ; CHECK-LABEL: test3:
-; CHECK: ldr r1, LCPI
+; CHECK: ldr r1,
 ; CHECK: add sp, r1
-; CHECK: ldr r1, LCPI
+; CHECK: ldr r1,
 ; CHECK: add r1, sp
-; CHECK: subs r4, r7, #4
-; CHECK: mov sp, r4
+; EABI: ldr r1,
+; EABI: add sp, r1
+; IOS: subs r4, r7, #4
+; IOS: mov sp, r4
     %retval = alloca i32, align 4
     %tmp = alloca i32, align 4
     %a = alloca [805306369 x i8], align 16
@@ -33,3 +59,22 @@ define i32 @test3() {
     %tmp1 = load i32* %tmp
     ret i32 %tmp1
 }
+
+; Here, the adds get optimized out because they are dead, but the calculation
+; of the address of stack_a is dead but not optimized out. When the address
+; calculation gets expanded to two instructions, we need to avoid reading a
+; dead register.
+; No CHECK lines (just test for crashes), as we hope this will be optimised
+; better in future.
+define i32 @test4() {
+entry:
+  %stack_a = alloca i8, align 1
+  %stack_b = alloca [256 x i32*], align 4
+  %int = ptrtoint i8* %stack_a to i32
+  %add = add i32 %int, 1
+  br label %block2
+
+block2:
+  %add2 = add i32 %add, 1
+  ret i32 0
+}
diff --git a/test/CodeGen/Thumb/ldm-merge-call.ll b/test/CodeGen/Thumb/ldm-merge-call.ll
new file mode 100644
index 0000000..febc96b
--- /dev/null
+++ b/test/CodeGen/Thumb/ldm-merge-call.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m--linux-gnueabi"
+
+; Function Attrs: nounwind optsize
+define void @foo(i32* nocapture readonly %A) #0 {
+entry:
+; CHECK-LABEL: foo:
+; CHECK: ldm r[[BASE:[0-9]]]!,
+; CHECK-NEXT: mov r[[BASE]],
+  %0 = load i32* %A, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i32 1
+  %1 = load i32* %arrayidx1, align 4
+  %call = tail call i32 @bar(i32 %0, i32 %1, i32 %0, i32 %1) #2
+  %call2 = tail call i32 @bar(i32 %0, i32 %1, i32 %0, i32 %1) #2
+  ret void
+}
+
+; Function Attrs: optsize
+declare i32 @bar(i32, i32, i32, i32) #1
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind optsize }
diff --git a/test/CodeGen/Thumb/ldm-merge-struct.ll b/test/CodeGen/Thumb/ldm-merge-struct.ll
new file mode 100644
index 0000000..2f732e0
--- /dev/null
+++ b/test/CodeGen/Thumb/ldm-merge-struct.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-none--eabi"
+
+%struct.S = type { i32, i32 }
+
+@s = common global %struct.S zeroinitializer, align 4
+
+define i32 @f() {
+entry:
+; CHECK-LABEL: f:
+; CHECK: ldm r[[BASE:[0-9]]],
+; CHECK-NEXT-NOT: subs r[[BASE]]
+  %0 = load i32* getelementptr inbounds (%struct.S* @s, i32 0, i32 0), align 4
+  %1 = load i32* getelementptr inbounds (%struct.S* @s, i32 0, i32 1), align 4
+  %cmp = icmp sgt i32 %0, %1
+  %2 = sub i32 0, %1
+  %cond.p = select i1 %cmp, i32 %1, i32 %2
+  %cond = add i32 %cond.p, %0
+  ret i32 %cond
+}
diff --git a/test/CodeGen/Thumb/ldm-stm-base-materialization.ll b/test/CodeGen/Thumb/ldm-stm-base-materialization.ll
new file mode 100644
index 0000000..6382c25
--- /dev/null
+++ b/test/CodeGen/Thumb/ldm-stm-base-materialization.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=thumbv6m-eabi -verify-machineinstrs -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-none--eabi"
+
+@a = external global i32*
+@b = external global i32*
+
+; Function Attrs: nounwind
+define void @foo() #0 {
+entry:
+; CHECK-LABEL: foo:
+; CHECK: ldr r[[SB:[0-9]]], .LCPI
+; CHECK: ldr r[[LB:[0-9]]], .LCPI
+; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4
+; CHECK-NEXT: ldm r[[NLB]],
+; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4
+; CHECK-NEXT: stm r[[NSB]]
+  %0 = load i32** @a, align 4
+  %arrayidx = getelementptr inbounds i32* %0, i32 1
+  %1 = bitcast i32* %arrayidx to i8*
+  %2 = load i32** @b, align 4
+  %arrayidx1 = getelementptr inbounds i32* %2, i32 1
+  %3 = bitcast i32* %arrayidx1 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 24, i32 4, i1 false)
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
diff --git a/test/CodeGen/Thumb/pop.ll b/test/CodeGen/Thumb/pop.ll
index 1e45c7f..3c539c6 100644
--- a/test/CodeGen/Thumb/pop.ll
+++ b/test/CodeGen/Thumb/pop.ll
@@ -7,7 +7,9 @@ define void @t(i8* %a, ...) nounwind {
 ; CHECK-NEXT: add sp, #12
 ; CHECK-NEXT: bx r3
 entry:
-  %a.addr = alloca i8*
-  store i8* %a, i8** %a.addr
+  %a.addr = alloca i8, i32 4
+  call void @llvm.va_start(i8* %a.addr)
   ret void
 }
+
+declare void @llvm.va_start(i8*) nounwind
diff --git a/test/CodeGen/Thumb/stack_guard_remat.ll b/test/CodeGen/Thumb/stack_guard_remat.ll
new file mode 100644
index 0000000..e949cc1
--- /dev/null
+++ b/test/CodeGen/Thumb/stack_guard_remat.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=thumb-apple-darwin -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=PIC
+; RUN: llc < %s -mtriple=thumb-apple-darwin -relocation-model=static -no-integrated-as | FileCheck %s -check-prefix=NO-PIC  -check-prefix=STATIC
+; RUN: llc < %s -mtriple=thumb-apple-darwin -relocation-model=dynamic-no-pic -no-integrated-as | FileCheck %s  -check-prefix=NO-PIC -check-prefix=DYNAMIC-NO-PIC
+
+;PIC:   foo2
+;PIC:   ldr [[R0:r[0-9]+]], [[LABEL0:LCPI[0-9_]+]]
+;PIC: [[LABEL1:LPC[0-9_]+]]:
+;PIC:   add [[R0]], pc
+;PIC:   ldr [[R1:r[0-9]+]], {{\[}}[[R0]]{{\]}}
+;PIC:   ldr [[R1:r[0-9]+]], {{\[}}[[R1]]{{\]}}
+
+;PIC:      [[LABEL0]]:
+;PIC-NEXT:   .long L___stack_chk_guard$non_lazy_ptr-([[LABEL1]]+4)
+
+;NO-PIC:   foo2
+;NO-PIC:   ldr [[R0:r[0-9]+]], [[LABEL0:LCPI[0-9_]+]]
+;NO-PIC-NOT: LPC
+;NO-PIC:   ldr {{r[0-9]+}}, {{\[}}[[R0]]{{\]}}
+
+;STATIC:      [[LABEL0]]:
+;STATIC-NEXT:   .long ___stack_chk_guard
+
+;DYNAMIC-NO-PIC:      [[LABEL0]]:
+;DYNAMIC-NO-PIC-NEXT:   .long L___stack_chk_guard$non_lazy_ptr
+
+; Function Attrs: nounwind ssp
+define i32 @test_stack_guard_remat() #0 {
+  %a1 = alloca [256 x i32], align 4
+  %1 = bitcast [256 x i32]* %a1 to i8*
+  call void @llvm.lifetime.start(i64 1024, i8* %1)
+  %2 = getelementptr inbounds [256 x i32]* %a1, i32 0, i32 0
+  call void @foo3(i32* %2) #3
+  call void asm sideeffect "foo2", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{sp},~{lr}"()
+  call void @llvm.lifetime.end(i64 1024, i8* %1)
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+declare void @foo3(i32*)
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Thumb/stm-merge.ll b/test/CodeGen/Thumb/stm-merge.ll
new file mode 100644
index 0000000..76e71f4
--- /dev/null
+++ b/test/CodeGen/Thumb/stm-merge.ll
@@ -0,0 +1,40 @@
+; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m--linux-gnueabi"
+
+@d = internal unnamed_addr global i32 0, align 4
+@c = internal global i32* null, align 4
+@e = internal unnamed_addr global i32* null, align 4
+
+; Function Attrs: nounwind optsize
+define void @fn1() #0 {
+entry:
+; CHECK-LABEL: fn1:
+; CHECK: stm r[[BASE:[0-9]]]!, {{.*}}
+; CHECK-NOT: {{.*}} r[[BASE]]
+; CHECK: ldr r[[BASE]], {{.*}}
+  %g = alloca i32, align 4
+  %h = alloca i32, align 4
+  store i32 1, i32* %g, align 4
+  store i32 0, i32* %h, align 4
+  %.pr = load i32* @d, align 4
+  %cmp11 = icmp slt i32 %.pr, 1
+  br i1 %cmp11, label %for.inc.lr.ph, label %for.body5
+
+for.inc.lr.ph:                                    ; preds = %entry
+  store i32 1, i32* @d, align 4
+  br label %for.body5
+
+for.body5:                                        ; preds = %entry, %for.inc.lr.ph, %for.body5
+  %f.010 = phi i32 [ %inc7, %for.body5 ], [ 0, %for.inc.lr.ph ], [ 0, %entry ]
+  store volatile i32* %g, i32** @c, align 4
+  %inc7 = add nsw i32 %f.010, 1
+  %exitcond = icmp eq i32 %inc7, 2
+  br i1 %exitcond, label %for.end8, label %for.body5
+
+for.end8:                                         ; preds = %for.body5
+  store i32* %h, i32** @e, align 4
+  ret void
+}
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Thumb/thumb-ldm.ll b/test/CodeGen/Thumb/thumb-ldm.ll
index 95f3edc..7e9560e 100644
--- a/test/CodeGen/Thumb/thumb-ldm.ll
+++ b/test/CodeGen/Thumb/thumb-ldm.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv6m-eabi -o - | FileCheck %s
-; XFAIL: *
+; RUN: llc < %s -mtriple=thumbv6m-eabi -verify-machineinstrs -o - | FileCheck %s
 
 @X = external global [0 x i32]          ; <[0 x i32]*> [#uses=5]
 
diff --git a/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll b/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll
index dedc82b..da2f3f0 100644
--- a/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll
+++ b/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll
@@ -1,35 +1,33 @@
-; RUN: llc -mtriple=thumbv6m-eabi %s -o - | FileCheck %s
-; XFAIL: *
-
+; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s
 @d = external global [64 x i32]
 @s = external global [64 x i32]
 
 ; Function Attrs: nounwind
 define void @t1() #0 {
 entry:
-; CHECK: ldr [[REG0:r[0-9]]],
-; CHECK: ldm [[REG0]]!,
-; CHECK: ldr [[REG1:r[0-9]]],
-; CHECK: stm [[REG1]]!,
-; CHECK: subs [[REG0]], #32
-; CHECK-NEXT: ldrb
-; CHECK: subs [[REG1]], #32
-; CHECK-NEXT: strb
-    tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 33, i32 4, i1 false)
+; CHECK-LABEL: t1:
+; CHECK: ldr r[[LB:[0-9]]],
+; CHECK-NEXT: ldm r[[LB]]!,
+; CHECK-NEXT: ldr r[[SB:[0-9]]],
+; CHECK-NEXT: stm r[[SB]]!,
+; CHECK-NEXT: ldrb {{.*}}, [r[[LB]]]
+; CHECK-NEXT: strb {{.*}}, [r[[SB]]]
+    tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 17, i32 4, i1 false)
     ret void
 }
 
 ; Function Attrs: nounwind
 define void @t2() #0 {
 entry:
-; CHECK: ldr [[REG0:r[0-9]]],
-; CHECK: ldm [[REG0]]!,
-; CHECK: ldr [[REG1:r[0-9]]],
-; CHECK: stm [[REG1]]!,
-; CHECK: ldrh
-; CHECK: ldrb
-; CHECK: strb
-; CHECK: strh
+; CHECK-LABEL: t2:
+; CHECK: ldr r[[LB:[0-9]]],
+; CHECK-NEXT: ldm r[[LB]]!,
+; CHECK-NEXT: ldr r[[SB:[0-9]]],
+; CHECK-NEXT: stm r[[SB]]!,
+; CHECK-NEXT: ldrh {{.*}}, [r[[LB]]]
+; CHECK-NEXT: ldrb {{.*}}, [r[[LB]], #2]
+; CHECK-NEXT: strb {{.*}}, [r[[SB]], #2]
+; CHECK-NEXT: strh {{.*}}, [r[[SB]]]
     tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 15, i32 4, i1 false)
     ret void
 }
diff --git a/test/CodeGen/Thumb2/2009-08-06-SpDecBug.ll b/test/CodeGen/Thumb2/2009-08-06-SpDecBug.ll
index c8eac8d..59c2367 100644
--- a/test/CodeGen/Thumb2/2009-08-06-SpDecBug.ll
+++ b/test/CodeGen/Thumb2/2009-08-06-SpDecBug.ll
@@ -13,6 +13,7 @@ entry:
 ; CHECK-NOT: mov sp, r7
 ; CHECK: add sp, #8
 	call void @__gcov_flush() nounwind
+	call void @llvm.va_start(i8* null)
 	br i1 undef, label %bb5, label %bb
 
 bb:		; preds = %bb, %entry
@@ -27,3 +28,5 @@ bb5:		; preds = %bb, %entry
 declare hidden void @__gcov_flush()
 
 declare i32 @execvp(i8*, i8**) nounwind
+
+declare void @llvm.va_start(i8*) nounwind
diff --git a/test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll b/test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll
index 524e5a6..89b7148 100644
--- a/test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll
+++ b/test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -std-compile-opts | \
+; RUN: opt < %s -O3 | \
 ; RUN:   llc -mtriple=thumbv7-apple-darwin10 -mattr=+neon | FileCheck %s
 
 define void @fred(i32 %three_by_three, i8* %in, double %dt1, i32 %x_size, i32 %y_size, i8* %bp) nounwind {
diff --git a/test/CodeGen/Thumb2/aapcs.ll b/test/CodeGen/Thumb2/aapcs.ll
new file mode 100644
index 0000000..21af8c1
--- /dev/null
+++ b/test/CodeGen/Thumb2/aapcs.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -mtriple=thumbv7-none-eabi   -mcpu=cortex-m4 -mattr=-vfp2             | FileCheck %s -check-prefix=CHECK -check-prefix=SOFT
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 -mattr=+vfp4,+fp-only-sp | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a8 -mattr=+vfp3             | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP
+
+define float @float_in_reg(float %a, float %b) {
+entry:
+; CHECK-LABEL: float_in_reg:
+; SOFT: mov r0, r1
+; HARD: vmov.f32  s0, s1
+; CHECK-NEXT: bx lr
+  ret float %b
+}
+
+define double @double_in_reg(double %a, double %b) {
+entry:
+; CHECK-LABEL: double_in_reg:
+; SOFT: mov r0, r2
+; SOFT: mov r1, r3
+; SP: vmov.f32  s0, s2
+; SP: vmov.f32  s1, s3
+; DP: vmov.f64  d0, d1
+; CHECK-NEXT: bx lr
+  ret double %b
+}
+
+define float @float_on_stack(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, float %i) {
+; CHECK-LABEL: float_on_stack:
+; SOFT: ldr r0, [sp, #48]
+; HARD: vldr s0, [sp]
+; CHECK-NEXT: bx lr
+  ret float %i
+}
+
+define double @double_on_stack(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i) {
+; CHECK-LABEL: double_on_stack:
+; SOFT: ldr r0, [sp, #48]
+; SOFT: ldr r1, [sp, #52]
+; HARD: vldr d0, [sp]
+; CHECK-NEXT: bx lr
+  ret double %i
+}
+
+define double @double_not_split(double %a, double %b, double %c, double %d, double %e, double %f, double %g, float %h, double %i) {
+; CHECK-LABEL: double_not_split:
+; SOFT: ldr r0, [sp, #48]
+; SOFT: ldr r1, [sp, #52]
+; HARD: vldr d0, [sp]
+; CHECK-NEXT: bx lr
+  ret double %i
+}
diff --git a/test/CodeGen/Thumb2/constant-islands-new-island.ll b/test/CodeGen/Thumb2/constant-islands-new-island.ll
new file mode 100644
index 0000000..8ed657e
--- /dev/null
+++ b/test/CodeGen/Thumb2/constant-islands-new-island.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabihf %s -o - | FileCheck %s
+
+; Check that new water is created by splitting the basic block right after the
+; load instruction. Previously, new water was created before the load
+; instruction, which caused the pass to fail to converge.
+
+define void @test(i1 %tst) {
+; CHECK-LABEL: test:
+; CHECK: vldr  {{s[0-9]+}}, [[CONST:\.LCPI[0-9]+_[0-9]+]]
+; CHECK-NEXT: b.w [[CONTINUE:\.LBB[0-9]+_[0-9]+]]
+
+; CHECK: [[CONST]]:
+; CHECK-NEXT: .long
+
+; CHECK: [[CONTINUE]]:
+
+entry:
+  call i32 @llvm.arm.space(i32 2000, i32 undef)
+  br i1 %tst, label %true, label %false
+
+true:
+  %val = phi float [12345.0, %entry], [undef, %false]
+  call void @bar(float %val)
+  ret void
+
+false:
+  br label %true
+}
+
+declare void @bar(float)
+declare i32 @llvm.arm.space(i32, i32)
diff --git a/test/CodeGen/Thumb2/cortex-fp.ll b/test/CodeGen/Thumb2/cortex-fp.ll
index e63970a..5548492e 100644
--- a/test/CodeGen/Thumb2/cortex-fp.ll
+++ b/test/CodeGen/Thumb2/cortex-fp.ll
@@ -1,13 +1,15 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -march=thumb -mcpu=cortex-m3 | FileCheck %s -check-prefix=CHECK -check-prefix=CORTEXM3
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -march=thumb -mcpu=cortex-m4 | FileCheck %s -check-prefix=CHECK -check-prefix=CORTEXM4
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -march=thumb -mcpu=cortex-m7 | FileCheck %s -check-prefix=CHECK -check-prefix=CORTEXM7
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -march=thumb -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK -check-prefix=CORTEXA8
 
 
 define float @foo(float %a, float %b) {
 entry:
 ; CHECK-LABEL: foo:
-; CORTEXM3: blx ___mulsf3
+; CORTEXM3: bl ___mulsf3
 ; CORTEXM4: vmul.f32  s
+; CORTEXM7: vmul.f32  s
 ; CORTEXA8: vmul.f32  d
   %0 = fmul float %a, %b
   ret float %0
@@ -17,8 +19,9 @@ define double @bar(double %a, double %b) {
 entry:
 ; CHECK-LABEL: bar:
   %0 = fmul double %a, %b
-; CORTEXM3: blx ___muldf3
-; CORTEXM4: blx ___muldf3
+; CORTEXM3: bl ___muldf3
+; CORTEXM4: {{bl|b.w}} ___muldf3
+; CORTEXM7: vmul.f64  d
 ; CORTEXA8: vmul.f64  d
   ret double %0
 }
diff --git a/test/CodeGen/Thumb2/float-cmp.ll b/test/CodeGen/Thumb2/float-cmp.ll
new file mode 100644
index 0000000..88d6c3b
--- /dev/null
+++ b/test/CodeGen/Thumb2/float-cmp.ll
@@ -0,0 +1,301 @@
+; RUN: llc < %s -mtriple=thumbv7-none-eabi   -mcpu=cortex-m3 | FileCheck %s -check-prefix=CHECK -check-prefix=NONE
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP
+
+
+
+define i1 @cmp_f_false(float %a, float %b) {
+; CHECK-LABEL: cmp_f_false:
+; NONE: movs r0, #0
+; HARD: movs r0, #0
+  %1 = fcmp false float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_oeq(float %a, float %b) {
+; CHECK-LABEL: cmp_f_oeq:
+; NONE: bl __aeabi_fcmpeq
+; HARD: vcmpe.f32
+; HARD: moveq r0, #1
+  %1 = fcmp oeq float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_ogt(float %a, float %b) {
+; CHECK-LABEL: cmp_f_ogt:
+; NONE: bl __aeabi_fcmpgt
+; HARD: vcmpe.f32
+; HARD: movgt r0, #1
+  %1 = fcmp ogt float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_oge(float %a, float %b) {
+; CHECK-LABEL: cmp_f_oge:
+; NONE: bl __aeabi_fcmpge
+; HARD: vcmpe.f32
+; HARD: movge r0, #1
+  %1 = fcmp oge float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_olt(float %a, float %b) {
+; CHECK-LABEL: cmp_f_olt:
+; NONE: bl __aeabi_fcmplt
+; HARD: vcmpe.f32
+; HARD: movmi r0, #1
+  %1 = fcmp olt float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_ole(float %a, float %b) {
+; CHECK-LABEL: cmp_f_ole:
+; NONE: bl __aeabi_fcmple
+; HARD: vcmpe.f32
+; HARD: movls r0, #1
+  %1 = fcmp ole float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_one(float %a, float %b) {
+; CHECK-LABEL: cmp_f_one:
+; NONE: bl __aeabi_fcmpgt
+; NONE: bl __aeabi_fcmplt
+; HARD: vcmpe.f32
+; HARD: movmi r0, #1
+; HARD: movgt r0, #1
+  %1 = fcmp one float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_ord(float %a, float %b) {
+; CHECK-LABEL: cmp_f_ord:
+; NONE: bl __aeabi_fcmpun
+; HARD: vcmpe.f32
+; HARD: movvc r0, #1
+  %1 = fcmp ord float %a, %b
+  ret i1 %1
+}define i1 @cmp_f_ueq(float %a, float %b) {
+; CHECK-LABEL: cmp_f_ueq:
+; NONE: bl __aeabi_fcmpeq
+; NONE: bl __aeabi_fcmpun
+; HARD: vcmpe.f32
+; HARD: moveq r0, #1
+; HARD: movvs r0, #1
+  %1 = fcmp ueq float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_ugt(float %a, float %b) {
+; CHECK-LABEL: cmp_f_ugt:
+; NONE: bl __aeabi_fcmpgt
+; NONE: bl __aeabi_fcmpun
+; HARD: vcmpe.f32
+; HARD: movhi r0, #1
+  %1 = fcmp ugt float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_uge(float %a, float %b) {
+; CHECK-LABEL: cmp_f_uge:
+; NONE: bl __aeabi_fcmpge
+; NONE: bl __aeabi_fcmpun
+; HARD: vcmpe.f32
+; HARD: movpl r0, #1
+  %1 = fcmp uge float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_ult(float %a, float %b) {
+; CHECK-LABEL: cmp_f_ult:
+; NONE: bl __aeabi_fcmplt
+; NONE: bl __aeabi_fcmpun
+; HARD: vcmpe.f32
+; HARD: movlt r0, #1
+  %1 = fcmp ult float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_ule(float %a, float %b) {
+; CHECK-LABEL: cmp_f_ule:
+; NONE: bl __aeabi_fcmple
+; NONE: bl __aeabi_fcmpun
+; HARD: vcmpe.f32
+; HARD: movle r0, #1
+  %1 = fcmp ule float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_une(float %a, float %b) {
+; CHECK-LABEL: cmp_f_une:
+; NONE: bl __aeabi_fcmpeq
+; HARD: vcmpe.f32
+; HARD: movne r0, #1
+  %1 = fcmp une float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_uno(float %a, float %b) {
+; CHECK-LABEL: cmp_f_uno:
+; NONE: bl __aeabi_fcmpun
+; HARD: vcmpe.f32
+; HARD: movvs r0, #1
+  %1 = fcmp uno float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_true(float %a, float %b) {
+; CHECK-LABEL: cmp_f_true:
+; NONE: movs r0, #1
+; HARD: movs r0, #1
+  %1 = fcmp true float %a, %b
+  ret i1 %1
+}
+
+define i1 @cmp_d_false(double %a, double %b) {
+; CHECK-LABEL: cmp_d_false:
+; NONE: movs r0, #0
+; HARD: movs r0, #0
+  %1 = fcmp false double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_oeq(double %a, double %b) {
+; CHECK-LABEL: cmp_d_oeq:
+; NONE: bl __aeabi_dcmpeq
+; SP: bl __aeabi_dcmpeq
+; DP: vcmpe.f64
+; DP: moveq r0, #1
+  %1 = fcmp oeq double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_ogt(double %a, double %b) {
+; CHECK-LABEL: cmp_d_ogt:
+; NONE: bl __aeabi_dcmpgt
+; SP: bl __aeabi_dcmpgt
+; DP: vcmpe.f64
+; DP: movgt r0, #1
+  %1 = fcmp ogt double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_oge(double %a, double %b) {
+; CHECK-LABEL: cmp_d_oge:
+; NONE: bl __aeabi_dcmpge
+; SP: bl __aeabi_dcmpge
+; DP: vcmpe.f64
+; DP: movge r0, #1
+  %1 = fcmp oge double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_olt(double %a, double %b) {
+; CHECK-LABEL: cmp_d_olt:
+; NONE: bl __aeabi_dcmplt
+; SP: bl __aeabi_dcmplt
+; DP: vcmpe.f64
+; DP: movmi r0, #1
+  %1 = fcmp olt double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_ole(double %a, double %b) {
+; CHECK-LABEL: cmp_d_ole:
+; NONE: bl __aeabi_dcmple
+; SP: bl __aeabi_dcmple
+; DP: vcmpe.f64
+; DP: movls r0, #1
+  %1 = fcmp ole double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_one(double %a, double %b) {
+; CHECK-LABEL: cmp_d_one:
+; NONE: bl __aeabi_dcmpgt
+; NONE: bl __aeabi_dcmplt
+; SP: bl __aeabi_dcmpgt
+; SP: bl __aeabi_dcmplt
+; DP: vcmpe.f64
+; DP: movmi r0, #1
+; DP: movgt r0, #1
+  %1 = fcmp one double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_ord(double %a, double %b) {
+; CHECK-LABEL: cmp_d_ord:
+; NONE: bl __aeabi_dcmpun
+; SP: bl __aeabi_dcmpun
+; DP: vcmpe.f64
+; DP: movvc r0, #1
+  %1 = fcmp ord double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_ugt(double %a, double %b) {
+; CHECK-LABEL: cmp_d_ugt:
+; NONE: bl __aeabi_dcmpgt
+; NONE: bl __aeabi_dcmpun
+; SP: bl __aeabi_dcmpgt
+; SP: bl __aeabi_dcmpun
+; DP: vcmpe.f64
+; DP: movhi r0, #1
+  %1 = fcmp ugt double %a, %b
+  ret i1 %1
+}
+
+define i1 @cmp_d_ult(double %a, double %b) {
+; CHECK-LABEL: cmp_d_ult:
+; NONE: bl __aeabi_dcmplt
+; NONE: bl __aeabi_dcmpun
+; SP: bl __aeabi_dcmplt
+; SP: bl __aeabi_dcmpun
+; DP: vcmpe.f64
+; DP: movlt r0, #1
+  %1 = fcmp ult double %a, %b
+  ret i1 %1
+}
+
+
+define i1 @cmp_d_uno(double %a, double %b) {
+; CHECK-LABEL: cmp_d_uno:
+; NONE: bl __aeabi_dcmpun
+; SP: bl __aeabi_dcmpun
+; DP: vcmpe.f64
+; DP: movvs r0, #1
+  %1 = fcmp uno double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_true(double %a, double %b) {
+; CHECK-LABEL: cmp_d_true:
+; NONE: movs r0, #1
+; HARD: movs r0, #1
+  %1 = fcmp true double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_ueq(double %a, double %b) {
+; CHECK-LABEL: cmp_d_ueq:
+; NONE: bl __aeabi_dcmpeq
+; NONE: bl __aeabi_dcmpun
+; SP: bl __aeabi_dcmpeq
+; SP: bl __aeabi_dcmpun
+; DP: vcmpe.f64
+; DP: moveq r0, #1
+; DP: movvs r0, #1
+  %1 = fcmp ueq double %a, %b
+  ret i1 %1
+}
+
+define i1 @cmp_d_uge(double %a, double %b) {
+; CHECK-LABEL: cmp_d_uge:
+; NONE: bl __aeabi_dcmpge
+; NONE: bl __aeabi_dcmpun
+; SP: bl __aeabi_dcmpge
+; SP: bl __aeabi_dcmpun
+; DP: vcmpe.f64
+; DP: movpl r0, #1
+  %1 = fcmp uge double %a, %b
+  ret i1 %1
+}
+
+define i1 @cmp_d_ule(double %a, double %b) {
+; CHECK-LABEL: cmp_d_ule:
+; NONE: bl __aeabi_dcmple
+; NONE: bl __aeabi_dcmpun
+; SP: bl __aeabi_dcmple
+; SP: bl __aeabi_dcmpun
+; DP: vcmpe.f64
+; DP: movle r0, #1
+  %1 = fcmp ule double %a, %b
+  ret i1 %1
+}
+
+define i1 @cmp_d_une(double %a, double %b) {
+; CHECK-LABEL: cmp_d_une:
+; NONE: bl __aeabi_dcmpeq
+; SP: bl __aeabi_dcmpeq
+; DP: vcmpe.f64
+; DP: movne r0, #1
+  %1 = fcmp une double %a, %b
+  ret i1 %1
+}
diff --git a/test/CodeGen/Thumb2/float-intrinsics-double.ll b/test/CodeGen/Thumb2/float-intrinsics-double.ll
new file mode 100644
index 0000000..01a23bd
--- /dev/null
+++ b/test/CodeGen/Thumb2/float-intrinsics-double.ll
@@ -0,0 +1,228 @@
+; RUN: llc < %s -mtriple=thumbv7-none-eabi   -mcpu=cortex-m3                    | FileCheck %s -check-prefix=CHECK -check-prefix=SOFT -check-prefix=NONE
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4                    | FileCheck %s -check-prefix=CHECK -check-prefix=SOFT -check-prefix=SP
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7                    | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=VFP  -check-prefix=FP-ARMv8
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 -mattr=+fp-only-sp | FileCheck %s -check-prefix=CHECK -check-prefix=SOFT -check-prefix=SP
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a7                    | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=NEON -check-prefix=VFP4
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a57                   | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=NEON -check-prefix=FP-ARMv8
+
+declare double     @llvm.sqrt.f64(double %Val)
+define double @sqrt_d(double %a) {
+; CHECK-LABEL: sqrt_d:
+; SOFT: {{(bl|b)}} sqrt
+; HARD: vsqrt.f64 d0, d0
+  %1 = call double @llvm.sqrt.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.powi.f64(double %Val, i32 %power)
+define double @powi_d(double %a, i32 %b) {
+; CHECK-LABEL: powi_d:
+; SOFT: {{(bl|b)}} __powidf2
+; HARD: b __powidf2
+  %1 = call double @llvm.powi.f64(double %a, i32 %b)
+  ret double %1
+}
+
+declare double     @llvm.sin.f64(double %Val)
+define double @sin_d(double %a) {
+; CHECK-LABEL: sin_d:
+; SOFT: {{(bl|b)}} sin
+; HARD: b sin
+  %1 = call double @llvm.sin.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.cos.f64(double %Val)
+define double @cos_d(double %a) {
+; CHECK-LABEL: cos_d:
+; SOFT: {{(bl|b)}} cos
+; HARD: b cos
+  %1 = call double @llvm.cos.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.pow.f64(double %Val, double %power)
+define double @pow_d(double %a, double %b) {
+; CHECK-LABEL: pow_d:
+; SOFT: {{(bl|b)}} pow
+; HARD: b pow
+  %1 = call double @llvm.pow.f64(double %a, double %b)
+  ret double %1
+}
+
+declare double     @llvm.exp.f64(double %Val)
+define double @exp_d(double %a) {
+; CHECK-LABEL: exp_d:
+; SOFT: {{(bl|b)}} exp
+; HARD: b exp
+  %1 = call double @llvm.exp.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.exp2.f64(double %Val)
+define double @exp2_d(double %a) {
+; CHECK-LABEL: exp2_d:
+; SOFT: {{(bl|b)}} exp2
+; HARD: b exp2
+  %1 = call double @llvm.exp2.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.log.f64(double %Val)
+define double @log_d(double %a) {
+; CHECK-LABEL: log_d:
+; SOFT: {{(bl|b)}} log
+; HARD: b log
+  %1 = call double @llvm.log.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.log10.f64(double %Val)
+define double @log10_d(double %a) {
+; CHECK-LABEL: log10_d:
+; SOFT: {{(bl|b)}} log10
+; HARD: b log10
+  %1 = call double @llvm.log10.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.log2.f64(double %Val)
+define double @log2_d(double %a) {
+; CHECK-LABEL: log2_d:
+; SOFT: {{(bl|b)}} log2
+; HARD: b log2
+  %1 = call double @llvm.log2.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.fma.f64(double %a, double %b, double %c)
+define double @fma_d(double %a, double %b, double %c) {
+; CHECK-LABEL: fma_d:
+; SOFT: {{(bl|b)}} fma
+; HARD: vfma.f64
+  %1 = call double @llvm.fma.f64(double %a, double %b, double %c)
+  ret double %1
+}
+
+; FIXME: the FPv4-SP version is less efficient than the no-FPU version
+declare double     @llvm.fabs.f64(double %Val)
+define double @abs_d(double %a) {
+; CHECK-LABEL: abs_d:
+; NONE: bic r1, r1, #-2147483648
+; SP: bl __aeabi_dcmpgt
+; SP: bl __aeabi_dcmpun
+; SP: bl __aeabi_dsub
+; DP: vabs.f64 d0, d0
+  %1 = call double @llvm.fabs.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.copysign.f64(double  %Mag, double  %Sgn)
+define double @copysign_d(double %a, double %b) {
+; CHECK-LABEL: copysign_d:
+; SOFT: lsrs [[REG:r[0-9]+]], r3, #31
+; SOFT: bfi r1, [[REG]], #31, #1
+; VFP: lsrs [[REG:r[0-9]+]], r3, #31
+; VFP: bfi r1, [[REG]], #31, #1
+; NEON: vmov.i32 [[REG:d[0-9]+]], #0x80000000
+; NEON: vshl.i64 [[REG]], [[REG]], #32
+; NEON: vbsl [[REG]], d
+  %1 = call double @llvm.copysign.f64(double %a, double %b)
+  ret double %1
+}
+
+declare double     @llvm.floor.f64(double %Val)
+define double @floor_d(double %a) {
+; CHECK-LABEL: floor_d:
+; SOFT: {{(bl|b)}} floor
+; VFP4: b floor
+; FP-ARMv8: vrintm.f64
+  %1 = call double @llvm.floor.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.ceil.f64(double %Val)
+define double @ceil_d(double %a) {
+; CHECK-LABEL: ceil_d:
+; SOFT: {{(bl|b)}} ceil
+; VFP4: b ceil
+; FP-ARMv8: vrintp.f64
+  %1 = call double @llvm.ceil.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.trunc.f64(double %Val)
+define double @trunc_d(double %a) {
+; CHECK-LABEL: trunc_d:
+; SOFT: {{(bl|b)}} trunc
+; FFP4: b trunc
+; FP-ARMv8: vrintz.f64
+  %1 = call double @llvm.trunc.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.rint.f64(double %Val)
+define double @rint_d(double %a) {
+; CHECK-LABEL: rint_d:
+; SOFT: {{(bl|b)}} rint
+; VFP4: b rint
+; FP-ARMv8: vrintx.f64
+  %1 = call double @llvm.rint.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.nearbyint.f64(double %Val)
+define double @nearbyint_d(double %a) {
+; CHECK-LABEL: nearbyint_d:
+; SOFT: {{(bl|b)}} nearbyint
+; VFP4: b nearbyint
+; FP-ARMv8: vrintr.f64
+  %1 = call double @llvm.nearbyint.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.round.f64(double %Val)
+define double @round_d(double %a) {
+; CHECK-LABEL: round_d:
+; SOFT: {{(bl|b)}} round
+; VFP4: b round
+; FP-ARMv8: vrinta.f64
+  %1 = call double @llvm.round.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.fmuladd.f64(double %a, double %b, double %c)
+define double @fmuladd_d(double %a, double %b, double %c) {
+; CHECK-LABEL: fmuladd_d:
+; SOFT: bl __aeabi_dmul
+; SOFT: bl __aeabi_dadd
+; VFP4: vmul.f64
+; VFP4: vadd.f64
+; FP-ARMv8: vmla.f64
+  %1 = call double @llvm.fmuladd.f64(double %a, double %b, double %c)
+  ret double %1
+}
+
+declare i16 @llvm.convert.to.fp16.f64(double %a)
+define i16 @d_to_h(double %a) {
+; CHECK-LABEL: d_to_h:
+; SOFT: bl __aeabi_d2h
+; VFP4: bl __aeabi_d2h
+; FP-ARMv8: vcvt{{[bt]}}.f16.f64
+  %1 = call i16 @llvm.convert.to.fp16.f64(double %a)
+  ret i16 %1
+}
+
+declare double @llvm.convert.from.fp16.f64(i16 %a)
+define double @h_to_d(i16 %a) {
+; CHECK-LABEL: h_to_d:
+; NONE: bl __gnu_h2f_ieee
+; NONE: bl __aeabi_f2d
+; SP: vcvt{{[bt]}}.f32.f16
+; SP: bl __aeabi_f2d
+; VFPv4: vcvt{{[bt]}}.f32.f16
+; VFPv4: vcvt.f64.f32
+; FP-ARMv8: vcvt{{[bt]}}.f64.f16
+  %1 = call double @llvm.convert.from.fp16.f64(i16 %a)
+  ret double %1
+}
diff --git a/test/CodeGen/Thumb2/float-intrinsics-float.ll b/test/CodeGen/Thumb2/float-intrinsics-float.ll
new file mode 100644
index 0000000..ec1bcd3
--- /dev/null
+++ b/test/CodeGen/Thumb2/float-intrinsics-float.ll
@@ -0,0 +1,221 @@
+; RUN: llc < %s -mtriple=thumbv7-none-eabi   -mcpu=cortex-m3                    | FileCheck %s -check-prefix=CHECK -check-prefix=SOFT -check-prefix=NONE
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4                    | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=VMLA
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7                    | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=VFP  -check-prefix=FP-ARMv8  -check-prefix=VMLA
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 -mattr=+fp-only-sp | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=FP-ARMv8 -check-prefix=VMLA
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a7                    | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=NEON -check-prefix=VFP4 -check-prefix=NO-VMLA
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a57                   | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=NEON -check-prefix=FP-ARMv8 -check-prefix=VMLA
+
+declare float     @llvm.sqrt.f32(float %Val)
+define float @sqrt_f(float %a) {
+; CHECK-LABEL: sqrt_f:
+; SOFT: bl sqrtf
+; HARD: vsqrt.f32 s0, s0
+  %1 = call float @llvm.sqrt.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.powi.f32(float %Val, i32 %power)
+define float @powi_f(float %a, i32 %b) {
+; CHECK-LABEL: powi_f:
+; SOFT: bl __powisf2
+; HARD: b __powisf2
+  %1 = call float @llvm.powi.f32(float %a, i32 %b)
+  ret float %1
+}
+
+declare float     @llvm.sin.f32(float %Val)
+define float @sin_f(float %a) {
+; CHECK-LABEL: sin_f:
+; SOFT: bl sinf
+; HARD: b sinf
+  %1 = call float @llvm.sin.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.cos.f32(float %Val)
+define float @cos_f(float %a) {
+; CHECK-LABEL: cos_f:
+; SOFT: bl cosf
+; HARD: b cosf
+  %1 = call float @llvm.cos.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.pow.f32(float %Val, float %power)
+define float @pow_f(float %a, float %b) {
+; CHECK-LABEL: pow_f:
+; SOFT: bl powf
+; HARD: b powf
+  %1 = call float @llvm.pow.f32(float %a, float %b)
+  ret float %1
+}
+
+declare float     @llvm.exp.f32(float %Val)
+define float @exp_f(float %a) {
+; CHECK-LABEL: exp_f:
+; SOFT: bl expf
+; HARD: b expf
+  %1 = call float @llvm.exp.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.exp2.f32(float %Val)
+define float @exp2_f(float %a) {
+; CHECK-LABEL: exp2_f:
+; SOFT: bl exp2f
+; HARD: b exp2f
+  %1 = call float @llvm.exp2.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.log.f32(float %Val)
+define float @log_f(float %a) {
+; CHECK-LABEL: log_f:
+; SOFT: bl logf
+; HARD: b logf
+  %1 = call float @llvm.log.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.log10.f32(float %Val)
+define float @log10_f(float %a) {
+; CHECK-LABEL: log10_f:
+; SOFT: bl log10f
+; HARD: b log10f
+  %1 = call float @llvm.log10.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.log2.f32(float %Val)
+define float @log2_f(float %a) {
+; CHECK-LABEL: log2_f:
+; SOFT: bl log2f
+; HARD: b log2f
+  %1 = call float @llvm.log2.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.fma.f32(float %a, float %b, float %c)
+define float @fma_f(float %a, float %b, float %c) {
+; CHECK-LABEL: fma_f:
+; SOFT: bl fmaf
+; HARD: vfma.f32
+  %1 = call float @llvm.fma.f32(float %a, float %b, float %c)
+  ret float %1
+}
+
+declare float     @llvm.fabs.f32(float %Val)
+define float @abs_f(float %a) {
+; CHECK-LABEL: abs_f:
+; SOFT: bic r0, r0, #-2147483648
+; HARD: vabs.f32
+  %1 = call float @llvm.fabs.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.copysign.f32(float  %Mag, float  %Sgn)
+define float @copysign_f(float %a, float %b) {
+; CHECK-LABEL: copysign_f:
+; NONE: lsrs [[REG:r[0-9]+]], r{{[0-9]+}}, #31
+; NONE: bfi r{{[0-9]+}}, [[REG]], #31, #1
+; SP: lsrs [[REG:r[0-9]+]], r{{[0-9]+}}, #31
+; SP: bfi r{{[0-9]+}}, [[REG]], #31, #1
+; VFP: lsrs [[REG:r[0-9]+]], r{{[0-9]+}}, #31
+; VFP: bfi r{{[0-9]+}}, [[REG]], #31, #1
+; NEON: vmov.i32 [[REG:d[0-9]+]], #0x80000000
+; NEON: vbsl [[REG]], d
+  %1 = call float @llvm.copysign.f32(float %a, float %b)
+  ret float %1
+}
+
+declare float     @llvm.floor.f32(float %Val)
+define float @floor_f(float %a) {
+; CHECK-LABEL: floor_f:
+; SOFT: bl floorf
+; VFP4: b floorf
+; FP-ARMv8: vrintm.f32
+  %1 = call float @llvm.floor.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.ceil.f32(float %Val)
+define float @ceil_f(float %a) {
+; CHECK-LABEL: ceil_f:
+; SOFT: bl ceilf
+; VFP4: b ceilf
+; FP-ARMv8: vrintp.f32
+  %1 = call float @llvm.ceil.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.trunc.f32(float %Val)
+define float @trunc_f(float %a) {
+; CHECK-LABEL: trunc_f:
+; SOFT: bl truncf
+; VFP4: b truncf
+; FP-ARMv8: vrintz.f32
+  %1 = call float @llvm.trunc.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.rint.f32(float %Val)
+define float @rint_f(float %a) {
+; CHECK-LABEL: rint_f:
+; SOFT: bl rintf
+; VFP4: b rintf
+; FP-ARMv8: vrintx.f32
+  %1 = call float @llvm.rint.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.nearbyint.f32(float %Val)
+define float @nearbyint_f(float %a) {
+; CHECK-LABEL: nearbyint_f:
+; SOFT: bl nearbyintf
+; VFP4: b nearbyintf
+; FP-ARMv8: vrintr.f32
+  %1 = call float @llvm.nearbyint.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.round.f32(float %Val)
+define float @round_f(float %a) {
+; CHECK-LABEL: round_f:
+; SOFT: bl roundf
+; VFP4: b roundf
+; FP-ARMv8: vrinta.f32
+  %1 = call float @llvm.round.f32(float %a)
+  ret float %1
+}
+
+; FIXME: why does cortex-m4 use vmla, while cortex-a7 uses vmul+vadd?
+; (these should be equivalent, even the rounding is the same)
+declare float     @llvm.fmuladd.f32(float %a, float %b, float %c)
+define float @fmuladd_f(float %a, float %b, float %c) {
+; CHECK-LABEL: fmuladd_f:
+; SOFT: bl __aeabi_fmul
+; SOFT: bl __aeabi_fadd
+; VMLA: vmla.f32
+; NO-VMLA: vmul.f32
+; NO-VMLA: vadd.f32
+  %1 = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+  ret float %1
+}
+
+declare i16 @llvm.convert.to.fp16.f32(float %a)
+define i16 @f_to_h(float %a) {
+; CHECK-LABEL: f_to_h:
+; SOFT: bl __gnu_f2h_ieee
+; HARD: vcvt{{[bt]}}.f16.f32
+  %1 = call i16 @llvm.convert.to.fp16.f32(float %a)
+  ret i16 %1
+}
+
+declare float @llvm.convert.from.fp16.f32(i16 %a)
+define float @h_to_f(i16 %a) {
+; CHECK-LABEL: h_to_f:
+; SOFT: bl __gnu_h2f_ieee
+; HARD: vcvt{{[bt]}}.f32.f16
+  %1 = call float @llvm.convert.from.fp16.f32(i16 %a)
+  ret float %1
+}
diff --git a/test/CodeGen/Thumb2/float-ops.ll b/test/CodeGen/Thumb2/float-ops.ll
new file mode 100644
index 0000000..d383065
--- /dev/null
+++ b/test/CodeGen/Thumb2/float-ops.ll
@@ -0,0 +1,293 @@
+; RUN: llc < %s -mtriple=thumbv7-none-eabi   -mcpu=cortex-m3 | FileCheck %s -check-prefix=CHECK -check-prefix=NONE
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=VFP4-ALL
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=FP-ARMv8
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=VFP4-ALL -check-prefix=VFP4-DP
+
+define float @add_f(float %a, float %b) {
+entry:
+; CHECK-LABEL: add_f:
+; NONE: bl __aeabi_fadd
+; HARD: vadd.f32  s0, s0, s1
+  %0 = fadd float %a, %b
+  ret float %0
+}
+
+define double @add_d(double %a, double %b) {
+entry:
+; CHECK-LABEL: add_d:
+; NONE: bl __aeabi_dadd
+; SP: bl __aeabi_dadd
+; DP: vadd.f64  d0, d0, d1
+  %0 = fadd double %a, %b
+  ret double %0
+}
+
+define float @sub_f(float %a, float %b) {
+entry:
+; CHECK-LABEL: sub_f:
+; NONE: bl __aeabi_fsub
+; HARD: vsub.f32  s
+  %0 = fsub float %a, %b
+  ret float %0
+}
+
+define double @sub_d(double %a, double %b) {
+entry:
+; CHECK-LABEL: sub_d:
+; NONE: bl __aeabi_dsub
+; SP: bl __aeabi_dsub
+; DP: vsub.f64  d0, d0, d1
+  %0 = fsub double %a, %b
+  ret double %0
+}
+
+define float @mul_f(float %a, float %b) {
+entry:
+; CHECK-LABEL: mul_f:
+; NONE: bl __aeabi_fmul
+; HARD: vmul.f32  s
+  %0 = fmul float %a, %b
+  ret float %0
+}
+
+define double @mul_d(double %a, double %b) {
+entry:
+; CHECK-LABEL: mul_d:
+; NONE: bl __aeabi_dmul
+; SP: bl __aeabi_dmul
+; DP: vmul.f64  d0, d0, d1
+  %0 = fmul double %a, %b
+  ret double %0
+}
+
+define float @div_f(float %a, float %b) {
+entry:
+; CHECK-LABEL: div_f:
+; NONE: bl __aeabi_fdiv
+; HARD: vdiv.f32  s
+  %0 = fdiv float %a, %b
+  ret float %0
+}
+
+define double @div_d(double %a, double %b) {
+entry:
+; CHECK-LABEL: div_d:
+; NONE: bl __aeabi_ddiv
+; SP: bl __aeabi_ddiv
+; DP: vdiv.f64  d0, d0, d1
+  %0 = fdiv double %a, %b
+  ret double %0
+}
+
+define float @rem_f(float %a, float %b) {
+entry:
+; CHECK-LABEL: rem_f:
+; NONE: bl fmodf
+; HARD: b fmodf
+  %0 = frem float %a, %b
+  ret float %0
+}
+
+define double @rem_d(double %a, double %b) {
+entry:
+; CHECK-LABEL: rem_d:
+; NONE: bl fmod
+; HARD: b fmod
+  %0 = frem double %a, %b
+  ret double %0
+}
+
+define float @load_f(float* %a) {
+entry:
+; CHECK-LABEL: load_f:
+; NONE: ldr r0, [r0]
+; HARD: vldr s0, [r0]
+  %0 = load float* %a, align 4
+  ret float %0
+}
+
+define double @load_d(double* %a) {
+entry:
+; CHECK-LABEL: load_d:
+; NONE: ldm.w r0, {r0, r1}
+; HARD: vldr d0, [r0]
+  %0 = load double* %a, align 8
+  ret double %0
+}
+
+define void @store_f(float* %a, float %b) {
+entry:
+; CHECK-LABEL: store_f:
+; NONE: str r1, [r0]
+; HARD: vstr s0, [r0]
+  store float %b, float* %a, align 4
+  ret void
+}
+
+define void @store_d(double* %a, double %b) {
+entry:
+; CHECK-LABEL: store_d:
+; NONE: mov r1, r3
+; NONE: str r2, [r0]
+; NONE: str r1, [r0, #4]
+; HARD: vstr d0, [r0]
+  store double %b, double* %a, align 8
+  ret void
+}
+
+define double @f_to_d(float %a) {
+; CHECK-LABEL: f_to_d:
+; NONE: bl __aeabi_f2d
+; SP: bl __aeabi_f2d
+; DP: vcvt.f64.f32 d0, s0
+  %1 = fpext float %a to double
+  ret double %1
+}
+
+define float @d_to_f(double %a) {
+; CHECK-LABEL: d_to_f:
+; NONE: bl __aeabi_d2f
+; SP: bl __aeabi_d2f
+; DP: vcvt.f32.f64 s0, d0
+  %1 = fptrunc double %a to float
+  ret float %1
+}
+
+define i32 @f_to_si(float %a) {
+; CHECK-LABEL: f_to_si:
+; NONE: bl __aeabi_f2iz
+; HARD: vcvt.s32.f32 s0, s0
+; HARD: vmov r0, s0
+  %1 = fptosi float %a to i32
+  ret i32 %1
+}
+
+define i32 @d_to_si(double %a) {
+; CHECK-LABEL: d_to_si:
+; NONE: bl __aeabi_d2iz
+; SP: vmov r0, r1, d0
+; SP: bl __aeabi_d2iz
+; DP: vcvt.s32.f64 s0, d0
+; DP: vmov r0, s0
+  %1 = fptosi double %a to i32
+  ret i32 %1
+}
+
+define i32 @f_to_ui(float %a) {
+; CHECK-LABEL: f_to_ui:
+; NONE: bl __aeabi_f2uiz
+; HARD: vcvt.u32.f32 s0, s0
+; HARD: vmov r0, s0
+  %1 = fptoui float %a to i32
+  ret i32 %1
+}
+
+define i32 @d_to_ui(double %a) {
+; CHECK-LABEL: d_to_ui:
+; NONE: bl __aeabi_d2uiz
+; SP: vmov r0, r1, d0
+; SP: bl __aeabi_d2uiz
+; DP: vcvt.u32.f64 s0, d0
+; DP: vmov r0, s0
+  %1 = fptoui double %a to i32
+  ret i32 %1
+}
+
+define float @si_to_f(i32 %a) {
+; CHECK-LABEL: si_to_f:
+; NONE: bl __aeabi_i2f
+; HARD: vcvt.f32.s32 s0, s0
+  %1 = sitofp i32 %a to float
+  ret float %1
+}
+
+define double @si_to_d(i32 %a) {
+; CHECK-LABEL: si_to_d:
+; NONE: bl __aeabi_i2d
+; SP: bl __aeabi_i2d
+; DP: vcvt.f64.s32 d0, s0
+  %1 = sitofp i32 %a to double
+  ret double %1
+}
+
+define float @ui_to_f(i32 %a) {
+; CHECK-LABEL: ui_to_f:
+; NONE: bl __aeabi_ui2f
+; HARD: vcvt.f32.u32 s0, s0
+  %1 = uitofp i32 %a to float
+  ret float %1
+}
+
+define double @ui_to_d(i32 %a) {
+; CHECK-LABEL: ui_to_d:
+; NONE: bl __aeabi_ui2d
+; SP: bl __aeabi_ui2d
+; DP: vcvt.f64.u32 d0, s0
+  %1 = uitofp i32 %a to double
+  ret double %1
+}
+
+define float @bitcast_i_to_f(i32 %a) {
+; CHECK-LABEL: bitcast_i_to_f:
+; NONE-NOT: mov
+; HARD: vmov s0, r0
+  %1 = bitcast i32 %a to float
+  ret float %1
+}
+
+define double @bitcast_i_to_d(i64 %a) {
+; CHECK-LABEL: bitcast_i_to_d:
+; NONE-NOT: mov
+; HARD: vmov d0, r0, r1
+  %1 = bitcast i64 %a to double
+  ret double %1
+}
+
+define i32 @bitcast_f_to_i(float %a) {
+; CHECK-LABEL: bitcast_f_to_i:
+; NONE-NOT: mov
+; HARD: vmov r0, s0
+  %1 = bitcast float %a to i32
+  ret i32 %1
+}
+
+define i64 @bitcast_d_to_i(double %a) {
+; CHECK-LABEL: bitcast_d_to_i:
+; NONE-NOT: mov
+; HARD: vmov r0, r1, d0
+  %1 = bitcast double %a to i64
+  ret i64 %1
+}
+
+define float @select_f(float %a, float %b, i1 %c) {
+; CHECK-LABEL: select_f:
+; NONE: tst.w   r2, #1
+; NONE: moveq   r0, r1
+; HARD: tst.w   r0, #1
+; VFP4-ALL: vmovne.f32      s1, s0
+; VFP4-ALL: vmov.f32        s0, s1
+; FP-ARMv8: vseleq.f32 s0, s1, s0
+  %1 = select i1 %c, float %a, float %b
+  ret float %1
+}
+
+define double @select_d(double %a, double %b, i1 %c) {
+; CHECK-LABEL: select_d:
+; NONE: ldr.w   [[REG:r[0-9]+]], [sp]
+; NONE: ands    [[REG]], [[REG]], #1
+; NONE: moveq   r0, r2
+; NONE: moveq   r1, r3
+; SP: ands r0, r0, #1
+; SP-DAG: vmov [[ALO:r[0-9]+]], [[AHI:r[0-9]+]], d0
+; SP-DAG: vmov [[BLO:r[0-9]+]], [[BHI:r[0-9]+]], d1
+; SP: itt ne
+; SP-DAG: movne [[BLO]], [[ALO]]
+; SP-DAG: movne [[BHI]], [[AHI]]
+; SP: vmov d0, [[BLO]], [[BHI]]
+; DP: tst.w   r0, #1
+; VFP4-DP: vmovne.f64      d1, d0
+; VFP4-DP: vmov.f64        d0, d1
+; FP-ARMV8: vseleq.f64      d0, d1, d0
+  %1 = select i1 %c, double %a, double %b
+  ret double %1
+}
diff --git a/test/CodeGen/Thumb2/stack_guard_remat.ll b/test/CodeGen/Thumb2/stack_guard_remat.ll
new file mode 100644
index 0000000..c8ea871
--- /dev/null
+++ b/test/CodeGen/Thumb2/stack_guard_remat.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=PIC
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -relocation-model=static -no-integrated-as | FileCheck %s -check-prefix=STATIC
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -relocation-model=dynamic-no-pic -no-integrated-as | FileCheck %s  -check-prefix=DYNAMIC-NO-PIC
+
+;PIC:   foo2
+;PIC:   movw  [[R0:r[0-9]+]], :lower16:(L___stack_chk_guard$non_lazy_ptr-([[LABEL0:LPC[0-9_]+]]+4))
+;PIC:   movt  [[R0]], :upper16:(L___stack_chk_guard$non_lazy_ptr-([[LABEL0]]+4))
+;PIC: [[LABEL0]]:
+;PIC:   add [[R0]], pc
+;PIC:   ldr [[R1:r[0-9]+]], {{\[}}[[R0]]{{\]}}
+;PIC:   ldr {{r[0-9]+}}, {{\[}}[[R1]]{{\]}}
+
+;STATIC:   foo2
+;STATIC:   movw  [[R0:r[0-9]+]], :lower16:___stack_chk_guard
+;STATIC:   movt  [[R0]], :upper16:___stack_chk_guard
+;STATIC:   ldr {{r[0-9]+}}, {{\[}}[[R0]]{{\]}}
+
+;DYNAMIC-NO-PIC:   foo2
+;DYNAMIC-NO-PIC:   movw  [[R0:r[0-9]+]], :lower16:L___stack_chk_guard$non_lazy_ptr
+;DYNAMIC-NO-PIC:   movt  [[R0]], :upper16:L___stack_chk_guard$non_lazy_ptr
+;DYNAMIC-NO-PIC:   ldr {{r[0-9]+}}, {{\[}}[[R0]]{{\]}}
+
+; Function Attrs: nounwind ssp
+define i32 @test_stack_guard_remat() #0 {
+  %a1 = alloca [256 x i32], align 4
+  %1 = bitcast [256 x i32]* %a1 to i8*
+  call void @llvm.lifetime.start(i64 1024, i8* %1)
+  %2 = getelementptr inbounds [256 x i32]* %a1, i32 0, i32 0
+  call void @foo3(i32* %2) #3
+  call void asm sideeffect "foo2", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{sp},~{lr}"()
+  call void @llvm.lifetime.end(i64 1024, i8* %1)
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+declare void @foo3(i32*)
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Thumb2/thumb2-sxt_rot.ll b/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
index cef3490..02a8c47 100644
--- a/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
+++ b/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
@@ -2,15 +2,15 @@
 ; RUN:  | FileCheck %s
 
 define i32 @test0(i8 %A) {
-; CHECK: test0
+; CHECK-LABEL: test0:
 ; CHECK: sxtb r0, r0
         %B = sext i8 %A to i32
 	ret i32 %B
 }
 
 define signext i8 @test1(i32 %A)  {
-; CHECK: test1
-; CHECK: sxtb.w r0, r0, ror #8
+; CHECK-LABEL: test1:
+; CHECK: sbfx r0, r0, #8, #8
 	%B = lshr i32 %A, 8
 	%C = shl i32 %A, 24
 	%D = or i32 %B, %C
@@ -19,9 +19,8 @@ define signext i8 @test1(i32 %A)  {
 }
 
 define signext i32 @test2(i32 %A, i32 %X)  {
-; CHECK: test2
-; CHECK: lsrs r0, r0, #8
-; CHECK: sxtab  r0, r1, r0
+; CHECK-LABEL: test2:
+; CHECK: sxtab  r0, r1, r0, ror #8
 	%B = lshr i32 %A, 8
 	%C = shl i32 %A, 24
 	%D = or i32 %B, %C
@@ -30,3 +29,14 @@ define signext i32 @test2(i32 %A, i32 %X)  {
         %G = add i32 %F, %X
 	ret i32 %G
 }
+
+define i32 @test3(i32 %A, i32 %X) {
+; CHECK-LABEL: test3:
+; CHECK: sxtah r0, r0, r1, ror #8
+  %X.hi = lshr i32 %X, 8
+  %X.trunc = trunc i32 %X.hi to i16
+  %addend = sext i16 %X.trunc to i32
+
+  %sum = add i32 %A, %addend
+  ret i32 %sum
+}
diff --git a/test/CodeGen/Thumb2/thumb2-uxt_rot.ll b/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
index bcd4a0f..4afea89 100644
--- a/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
+++ b/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
@@ -24,8 +24,8 @@ define zeroext i32 @test2(i32 %A.u, i32 %B.u)  {
 }
 
 define zeroext i32 @test3(i32 %A.u)  {
-; A8: test3
-; A8: uxth.w r0, r0, ror #8
+; A8-LABEL: test3
+; A8: ubfx  r0, r0, #8, #16
     %B.u = lshr i32 %A.u, 8
     %C.u = shl i32 %A.u, 24
     %D.u = or i32 %B.u, %C.u
@@ -33,3 +33,25 @@ define zeroext i32 @test3(i32 %A.u)  {
     %F.u = zext i16 %E.u to i32
     ret i32 %F.u
 }
+
+define i32 @test4(i32 %A, i32 %X) {
+; A8-LABEL: test4:
+; A8: uxtab r0, r0, r1, ror #16
+  %X.hi = lshr i32 %X, 16
+  %X.trunc = trunc i32 %X.hi to i8
+  %addend = zext i8 %X.trunc to i32
+
+  %sum = add i32 %A, %addend
+  ret i32 %sum
+}
+
+define i32 @test5(i32 %A, i32 %X) {
+; A8-LABEL: test5:
+; A8: uxtah r0, r0, r1, ror #8
+  %X.hi = lshr i32 %X, 8
+  %X.trunc = trunc i32 %X.hi to i16
+  %addend = zext i16 %X.trunc to i32
+
+  %sum = add i32 %A, %addend
+  ret i32 %sum
+}
diff --git a/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll b/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
index 638d399..62c503d 100644
--- a/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
+++ b/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=i686-pc-linux-gnu | FileCheck %s
 
-@__gthrw_pthread_once = alias weak i32 (i32*, void ()*)* @pthread_once		; <i32 (i32*, void ()*)*> [#uses=0]
+@__gthrw_pthread_once = weak alias i32 (i32*, void ()*)* @pthread_once		; <i32 (i32*, void ()*)*> [#uses=0]
 
 define weak i32 @pthread_once(i32*, void ()*) {
   ret i32 0
diff --git a/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll b/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll
index d2d5149..35857b7 100644
--- a/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll
+++ b/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep xor | grep CPI
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
 
+; CHECK: xorpd {{.*}}{{LCPI0_0|__xmm@}}
 define void @casin({ double, double }* sret  %agg.result, double %z.0, double %z.1) nounwind  {
 entry:
 	%memtmp = alloca { double, double }, align 8		; <{ double, double }*> [#uses=3]
diff --git a/test/CodeGen/X86/2008-06-18-BadShuffle.ll b/test/CodeGen/X86/2008-06-18-BadShuffle.ll
deleted file mode 100644
index 66f9065..0000000
--- a/test/CodeGen/X86/2008-06-18-BadShuffle.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=i386 -mattr=+sse2 | grep pinsrw
-
-; Test to make sure we actually insert the bottom element of the vector
-define <8 x i16> @a(<8 x i16> %a) nounwind  {
-entry:
-	shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> < i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8 >
-	%add = add <8 x i16> %0, %a
-	ret <8 x i16> %add
-}
-
diff --git a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
index 296f0ca..207d122 100644
--- a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
+++ b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
@@ -14,9 +14,9 @@ entry:
   %2 = alloca i64                                 ; <i64*> [#uses=1]
   %3 = alloca i64                                 ; <i64*> [#uses=6]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{i8** %s1_addr}, metadata !0), !dbg !7
+  call void @llvm.dbg.declare(metadata !{i8** %s1_addr}, metadata !0, metadata !{metadata !"0x102"}), !dbg !7
   store i8* %s1, i8** %s1_addr
-  call void @llvm.dbg.declare(metadata !{[0 x i8]** %str.0}, metadata !8), !dbg !7
+  call void @llvm.dbg.declare(metadata !{[0 x i8]** %str.0}, metadata !8, metadata !{metadata !"0x102"}), !dbg !7
   %4 = call i8* @llvm.stacksave(), !dbg !7        ; <i8*> [#uses=1]
   store i8* %4, i8** %saved_stack.1, align 8, !dbg !7
   %5 = load i8** %s1_addr, align 8, !dbg !13      ; <i8*> [#uses=1]
@@ -58,7 +58,7 @@ return:                                           ; preds = %entry
   ret i8 %retval12, !dbg !16
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i8* @llvm.stacksave() nounwind
 
@@ -66,21 +66,21 @@ declare i64 @strlen(i8*) nounwind readonly
 
 declare void @llvm.stackrestore(i8*) nounwind
 
-!0 = metadata !{i32 459009, metadata !1, metadata !"s1", metadata !2, i32 2, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 458798, i32 0, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 458769, metadata !17, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !18, metadata !18, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 458773, null, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00s1\002\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\000\000", i32 0, metadata !2, metadata !3, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !17, metadata !18, metadata !18, null, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5, metadata !6}
-!5 = metadata !{i32 458788, null, metadata !2, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 458767, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_pointer_type ]
+!5 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !2, metadata !5} ; [ DW_TAG_pointer_type ]
 !7 = metadata !{i32 2, i32 0, metadata !1, null}
-!8 = metadata !{i32 459008, metadata !1, metadata !"str.0", metadata !2, i32 3, metadata !9} ; [ DW_TAG_auto_variable ]
-!9 = metadata !{i32 458767, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 458753, null, metadata !2, metadata !"", i32 0, i64 8, i64 8, i64 0, i32 0, metadata !5, metadata !11, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
+!8 = metadata !{metadata !"0x100\00str.0\003\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", null, metadata !2, metadata !10} ; [ DW_TAG_pointer_type ]
+!10 = metadata !{metadata !"0x1\00\000\008\008\000\000", null, metadata !2, metadata !5, metadata !11, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
 !11 = metadata !{metadata !12}
-!12 = metadata !{i32 458785, i64 0, i64 1}        ; [ DW_TAG_subrange_type ]
+!12 = metadata !{metadata !"0x21\000\001"}        ; [ DW_TAG_subrange_type ]
 !13 = metadata !{i32 3, i32 0, metadata !14, null}
-!14 = metadata !{i32 458763, metadata !17, metadata !1, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0xb\000\000\000", metadata !17, metadata !1} ; [ DW_TAG_lexical_block ]
 !15 = metadata !{i32 4, i32 0, metadata !14, null}
 !16 = metadata !{i32 5, i32 0, metadata !14, null}
 !17 = metadata !{metadata !"vla.c", metadata !"/tmp/"}
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index 764c2cd..e046b96 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "4 machine-licm"
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "7 machine-licm"
 ; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s
 ; rdar://6627786
 ; rdar://7792037
diff --git a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll b/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
deleted file mode 100644
index e1930e0..0000000
--- a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc -mtriple=i386-apple-darwin10.0 -relocation-model=pic -asm-verbose=false \
-; RUN:     -mcpu=generic -disable-fp-elim -mattr=-sse4.1,-sse3,+sse2 -post-RA-scheduler=false -regalloc=basic < %s | \
-; RUN:   FileCheck %s
-; rdar://6808032
-
-; CHECK: pextrw $14
-; CHECK-NEXT: shrl $8
-; CHECK-NEXT: pinsrw
-
-define void @update(i8** %args_list) nounwind {
-entry:
-	%cmp.i = icmp eq i32 0, 0		; <i1> [#uses=1]
-	br i1 %cmp.i, label %if.then.i, label %test_cl.exit
-
-if.then.i:		; preds = %entry
-	%val = load <16 x i8> addrspace(1)* null		; <<16 x i8>> [#uses=8]
-	%tmp10.i = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 undef, i8 0, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef>, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 4, i32 undef, i32 6, i32 undef, i32 29, i32 undef, i32 10, i32 11, i32 12, i32 undef, i32 undef, i32 undef>		; <<16 x i8>> [#uses=1]
-	%tmp17.i = shufflevector <16 x i8> %tmp10.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 18, i32 4, i32 undef, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 undef, i32 undef, i32 undef>		; <<16 x i8>> [#uses=1]
-	%tmp24.i = shufflevector <16 x i8> %tmp17.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 24, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 undef, i32 undef, i32 undef>		; <<16 x i8>> [#uses=1]
-	%tmp31.i = shufflevector <16 x i8> %tmp24.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 21, i32 undef, i32 undef>		; <<16 x i8>> [#uses=1]
-	%tmp38.i = shufflevector <16 x i8> %tmp31.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 27, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 13, i32 undef, i32 undef>		; <<16 x i8>> [#uses=1]
-	%tmp45.i = shufflevector <16 x i8> %tmp38.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 13, i32 29, i32 undef>		; <<16 x i8>> [#uses=1]
-	%tmp52.i = shufflevector <16 x i8> %tmp45.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 21, i32 10, i32 11, i32 12, i32 13, i32 14, i32 undef>		; <<16 x i8>> [#uses=1]
-	%tmp59.i = shufflevector <16 x i8> %tmp52.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 20>		; <<16 x i8>> [#uses=1]
-	store <16 x i8> %tmp59.i, <16 x i8> addrspace(1)* null
-	ret void
-
-test_cl.exit:		; preds = %entry
-	ret void
-}
diff --git a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
index 50c62df..ffbe02c 100644
--- a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
+++ b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 > %t1
-; RUN: grep movzwl %t1 | count 2
-; RUN: grep movzbl %t1 | count 1
-; RUN: grep movd %t1 | count 4
+; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
 
 define <4 x i16> @a(i32* %x1) nounwind {
+; CHECK-LABEL: a:
+; CHECK:         shrl %[[R:[^,]+]]
+; CHECK-NEXT:    movd %[[R]], %xmm0
+; CHECK-NEXT:    retl
+
   %x2 = load i32* %x1
   %x3 = lshr i32 %x2, 1
   %x = trunc i32 %x3 to i16
@@ -12,6 +14,12 @@ define <4 x i16> @a(i32* %x1) nounwind {
 }
 
 define <8 x i16> @b(i32* %x1) nounwind {
+; CHECK-LABEL: b:
+; CHECK:         shrl %e[[R:.]]x
+; CHECK-NEXT:    movzwl %[[R]]x, %e[[R]]x
+; CHECK-NEXT:    movd %e[[R]]x, %xmm0
+; CHECK-NEXT:    retl
+
   %x2 = load i32* %x1
   %x3 = lshr i32 %x2, 1
   %x = trunc i32 %x3 to i16
@@ -20,6 +28,12 @@ define <8 x i16> @b(i32* %x1) nounwind {
 }
 
 define <8 x i8> @c(i32* %x1) nounwind {
+; CHECK-LABEL: c:
+; CHECK:         shrl %e[[R:.]]x
+; CHECK-NEXT:    movzwl %[[R]]x, %e[[R]]x
+; CHECK-NEXT:    movd %e[[R]]x, %xmm0
+; CHECK-NEXT:    retl
+
   %x2 = load i32* %x1
   %x3 = lshr i32 %x2, 1
   %x = trunc i32 %x3 to i8
@@ -28,6 +42,12 @@ define <8 x i8> @c(i32* %x1) nounwind {
 }
 
 define <16 x i8> @d(i32* %x1) nounwind {
+; CHECK-LABEL: d:
+; CHECK:         shrl %e[[R:.]]x
+; CHECK-NEXT:    movzbl %[[R]]l, %e[[R]]x
+; CHECK-NEXT:    movd %e[[R]]x, %xmm0
+; CHECK-NEXT:    retl
+
   %x2 = load i32* %x1
   %x3 = lshr i32 %x2, 1
   %x = trunc i32 %x3 to i8
diff --git a/test/CodeGen/X86/2009-10-16-Scope.ll b/test/CodeGen/X86/2009-10-16-Scope.ll
index a936edc..6fe2ee4 100644
--- a/test/CodeGen/X86/2009-10-16-Scope.ll
+++ b/test/CodeGen/X86/2009-10-16-Scope.ll
@@ -9,7 +9,7 @@ entry:
   br label %do.body, !dbg !0
 
 do.body:                                          ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{i32* %count_}, metadata !4)
+  call void @llvm.dbg.declare(metadata !{i32* %count_}, metadata !4, metadata !{metadata !"0x102"})
   %conv = ptrtoint i32* %count_ to i32, !dbg !0   ; <i32> [#uses=1]
   %call = call i32 @foo(i32 %conv) ssp, !dbg !0   ; <i32> [#uses=0]
   br label %do.end, !dbg !0
@@ -18,17 +18,17 @@ do.end:                                           ; preds = %do.body
   ret void, !dbg !7
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i32 @foo(i32) ssp
 
 !0 = metadata !{i32 5, i32 2, metadata !1, null}
-!1 = metadata !{i32 458763, null, metadata !2, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, null, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!3 = metadata !{i32 458769, metadata !8, i32 12, metadata !"clang 1.1", i1 true, metadata !"", i32 0, null, metadata !9, null, null, null, metadata !""}; [DW_TAG_compile_unit ]
-!4 = metadata !{i32 459008, metadata !5, metadata !"count_", metadata !3, i32 5, metadata !6}; [ DW_TAG_auto_variable ]
-!5 = metadata !{i32 458763, null, metadata !1, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!6 = metadata !{i32 458788, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}; [DW_TAG_base_type ]
+!1 = metadata !{metadata !"0xb\001\001\000", null, metadata !2}; [DW_TAG_lexical_block ]
+!2 = metadata !{metadata !"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", i32 0, metadata !3, null, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!3 = metadata !{metadata !"0x11\0012\00clang 1.1\001\00\000\00\000", metadata !8, null, metadata !9, null, null, null}; [DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x100\00count_\005\000", metadata !5, metadata !3, metadata !6}; [ DW_TAG_auto_variable ]
+!5 = metadata !{metadata !"0xb\001\001\000", null, metadata !1}; [DW_TAG_lexical_block ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !3}; [DW_TAG_base_type ]
 !7 = metadata !{i32 6, i32 1, metadata !2, null}
 !8 = metadata !{metadata !"genmodes.i", metadata !"/Users/yash/Downloads"}
 !9 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll
index f99e682..0e2ed9d 100644
--- a/test/CodeGen/X86/2010-01-18-DbgValue.ll
+++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll
@@ -12,7 +12,7 @@ entry:
   %retval = alloca double                         ; <double*> [#uses=2]
   %0 = alloca double                              ; <double*> [#uses=2]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.Rect* %my_r0}, metadata !0), !dbg !15
+  call void @llvm.dbg.declare(metadata !{%struct.Rect* %my_r0}, metadata !0, metadata !{metadata !"0x102"}), !dbg !15
   %1 = getelementptr inbounds %struct.Rect* %my_r0, i32 0, i32 0, !dbg !16 ; <%struct.Pt*> [#uses=1]
   %2 = getelementptr inbounds %struct.Pt* %1, i32 0, i32 0, !dbg !16 ; <double*> [#uses=1]
   %3 = load double* %2, align 8, !dbg !16         ; <double> [#uses=1]
@@ -26,30 +26,30 @@ return:                                           ; preds = %entry
   ret double %retval1, !dbg !16
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!21}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"my_r0", metadata !2, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 11, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, double (%struct.Rect*)* @foo, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !19, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !20, metadata !20, metadata !18, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00my_r0\0011\000", metadata !1, metadata !2, metadata !7} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0011\000\001\000\006\000\000\0011", metadata !19, metadata !2, metadata !4, null, double (%struct.Rect*)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !19, metadata !20, metadata !20, metadata !18, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !19, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !7}
-!6 = metadata !{i32 786468, metadata !19, metadata !2, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Rect", i32 6, i64 256, i64 64, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
+!6 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", metadata !19, metadata !2} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0x13\00Rect\006\00256\0064\000\000\000", metadata !19, metadata !2, null, metadata !8, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
 !8 = metadata !{metadata !9, metadata !14}
-!9 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P1", i32 7, i64 128, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Pt", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
+!9 = metadata !{metadata !"0xd\00P1\007\00128\0064\000\000", metadata !19, metadata !7, metadata !10} ; [ DW_TAG_member ]
+!10 = metadata !{metadata !"0x13\00Pt\001\00128\0064\000\000\000", metadata !19, metadata !2, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
 !11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"x", i32 2, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
-!13 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"y", i32 3, i64 64, i64 64, i64 64, i32 0, metadata !6} ; [ DW_TAG_member ]
-!14 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P2", i32 8, i64 128, i64 64, i64 128, i32 0, metadata !10} ; [ DW_TAG_member ]
+!12 = metadata !{metadata !"0xd\00x\002\0064\0064\000\000", metadata !19, metadata !10, metadata !6} ; [ DW_TAG_member ]
+!13 = metadata !{metadata !"0xd\00y\003\0064\0064\0064\000", metadata !19, metadata !10, metadata !6} ; [ DW_TAG_member ]
+!14 = metadata !{metadata !"0xd\00P2\008\00128\0064\00128\000", metadata !19, metadata !7, metadata !10} ; [ DW_TAG_member ]
 !15 = metadata !{i32 11, i32 0, metadata !1, null}
 !16 = metadata !{i32 12, i32 0, metadata !17, null}
-!17 = metadata !{i32 786443, metadata !19, metadata !1, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!17 = metadata !{metadata !"0xb\0011\000\000", metadata !19, metadata !1} ; [ DW_TAG_lexical_block ]
 !18 = metadata !{metadata !1}
 !19 = metadata !{metadata !"b2.c", metadata !"/tmp/"}
 !20 = metadata !{i32 0}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
index 4d4e8c1..a35efdc 100644
--- a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
+++ b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
@@ -8,28 +8,28 @@
 
 define i32 @"main(tart.core.String[])->int32"(i32 %args) {
 entry:
-  tail call void @llvm.dbg.value(metadata !14, i64 0, metadata !8)
+  tail call void @llvm.dbg.value(metadata !14, i64 0, metadata !8, metadata !{metadata !"0x102"})
   tail call void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.ComplexType* @.type.SwitchStmtTest) ; <%tart.core.Object*> [#uses=2]
   ret i32 3
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 declare void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.ComplexType*) nounwind readnone
 
-!0 = metadata !{i32 458769, metadata !15, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !16, metadata !16, null, null, null, i32 0} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 458790, metadata !15, metadata !0, metadata !"", i32 0, i64 192, i64 64, i64 0, i32 0, metadata !2} ; [ DW_TAG_const_type ]
-!2 = metadata !{i32 458771, metadata !15, metadata !0, metadata !"C", i32 1, i64 192, i64 64, i64 0, i32 0, null, metadata !3, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 192, align 64, offset 0] [def] [from ]
+!0 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !15, metadata !16, metadata !16, null, null, null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x26\00\000\00192\0064\000\000", metadata !15, metadata !0, metadata !2} ; [ DW_TAG_const_type ]
+!2 = metadata !{metadata !"0x13\00C\001\00192\0064\000\000\000", metadata !15, metadata !0, null, metadata !3, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 192, align 64, offset 0] [def] [from ]
 !3 = metadata !{metadata !4, metadata !6, metadata !7}
-!4 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"x", i32 1, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ]
-!5 = metadata !{i32 458788, metadata !15, metadata !0, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"y", i32 1, i64 64, i64 64, i64 64, i32 0, metadata !5} ; [ DW_TAG_member ]
-!7 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"z", i32 1, i64 64, i64 64, i64 128, i32 0, metadata !5} ; [ DW_TAG_member ]
-!8 = metadata !{i32 459008, metadata !9, metadata !"t", metadata !0, i32 5, metadata !2} ; [ DW_TAG_auto_variable ]
-!9 = metadata !{i32 458763, null, metadata !10, i32 0, i32 0, i32 0}        ; [ DW_TAG_lexical_block ]
-!10 = metadata !{i32 458798, i32 0, metadata !0, metadata !"foo", metadata !"foo", metadata !"foo", i32 4, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 458773, metadata !15, metadata !0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0xd\00x\001\0064\0064\000\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_member ]
+!5 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", metadata !15, metadata !0} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0xd\00y\001\0064\0064\0064\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_member ]
+!7 = metadata !{metadata !"0xd\00z\001\0064\0064\00128\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_member ]
+!8 = metadata !{metadata !"0x100\00t\005\000", metadata !9, metadata !0, metadata !2} ; [ DW_TAG_auto_variable ]
+!9 = metadata !{metadata !"0xb\000\000\000", null, metadata !10}        ; [ DW_TAG_lexical_block ]
+!10 = metadata !{metadata !"0x2e\00foo\00foo\00foo\004\000\001\000\006\000\000\000", i32 0, metadata !0, metadata !11, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !15, metadata !0, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 458788, metadata !15, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !15, metadata !0} ; [ DW_TAG_base_type ]
 !14 = metadata !{%tart.reflect.ComplexType* @.type.SwitchStmtTest}
 !15 = metadata !{metadata !"sm.c", metadata !""}
 !16 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
index 5372bc5..60025bf 100644
--- a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
+++ b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
@@ -7,7 +7,7 @@ entry:
         %tmp1 = bitcast double %a to <8 x i8>
         %tmp2 = bitcast double %b to <8 x i8>
         %tmp3 = add <8 x i8> %tmp1, %tmp2
-; CHECK:  paddw
+; CHECK:  paddb
         store <8 x i8> %tmp3, <8 x i8>* null
         ret void
 }
@@ -18,7 +18,7 @@ entry:
         %tmp1 = bitcast double %a to <4 x i16>
         %tmp2 = bitcast double %b to <4 x i16>
         %tmp3 = add <4 x i16> %tmp1, %tmp2
-; CHECK:  paddd
+; CHECK:  paddw
         store <4 x i16> %tmp3, <4 x i16>* null
         ret void
 }
@@ -29,7 +29,7 @@ entry:
         %tmp1 = bitcast double %a to <2 x i32>
         %tmp2 = bitcast double %b to <2 x i32>
         %tmp3 = add <2 x i32> %tmp1, %tmp2
-; CHECK:  paddq
+; CHECK:  paddd
         store <2 x i32> %tmp3, <2 x i32>* null
         ret void
 }
diff --git a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
index 7faee99..1998011 100644
--- a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
@@ -2,8 +2,7 @@
 ; RUN: llc -mtriple=x86_64-pc-linux -O2 -regalloc=basic < %s | FileCheck %s
 ; Test to check .debug_loc support. This test case emits many debug_loc entries.
 
-; CHECK: Loc expr size
-; CHECK-NEXT: .short
+; CHECK: .short {{.*}} # Loc expr size
 ; CHECK-NEXT: .Ltmp
 ; CHECK-NEXT: DW_OP_reg
 
@@ -11,10 +10,10 @@
 
 define hidden %0 @__divsc3(float %a, float %b, float %c, float %d) nounwind readnone {
 entry:
-  tail call void @llvm.dbg.value(metadata !{float %a}, i64 0, metadata !0)
-  tail call void @llvm.dbg.value(metadata !{float %b}, i64 0, metadata !11)
-  tail call void @llvm.dbg.value(metadata !{float %c}, i64 0, metadata !12)
-  tail call void @llvm.dbg.value(metadata !{float %d}, i64 0, metadata !13)
+  tail call void @llvm.dbg.value(metadata !{float %a}, i64 0, metadata !0, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata !{float %b}, i64 0, metadata !11, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata !{float %c}, i64 0, metadata !12, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata !{float %d}, i64 0, metadata !13, metadata !{metadata !"0x102"})
   %0 = tail call float @fabsf(float %c) nounwind readnone, !dbg !19 ; <float> [#uses=1]
   %1 = tail call float @fabsf(float %d) nounwind readnone, !dbg !19 ; <float> [#uses=1]
   %2 = fcmp olt float %0, %1, !dbg !19            ; <i1> [#uses=1]
@@ -22,34 +21,34 @@ entry:
 
 bb:                                               ; preds = %entry
   %3 = fdiv float %c, %d, !dbg !20                ; <float> [#uses=3]
-  tail call void @llvm.dbg.value(metadata !{float %3}, i64 0, metadata !16), !dbg !20
+  tail call void @llvm.dbg.value(metadata !{float %3}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !20
   %4 = fmul float %3, %c, !dbg !21                ; <float> [#uses=1]
   %5 = fadd float %4, %d, !dbg !21                ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %5}, i64 0, metadata !14), !dbg !21
+  tail call void @llvm.dbg.value(metadata !{float %5}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !21
   %6 = fmul float %3, %a, !dbg !22                ; <float> [#uses=1]
   %7 = fadd float %6, %b, !dbg !22                ; <float> [#uses=1]
   %8 = fdiv float %7, %5, !dbg !22                ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %8}, i64 0, metadata !17), !dbg !22
+  tail call void @llvm.dbg.value(metadata !{float %8}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !22
   %9 = fmul float %3, %b, !dbg !23                ; <float> [#uses=1]
   %10 = fsub float %9, %a, !dbg !23               ; <float> [#uses=1]
   %11 = fdiv float %10, %5, !dbg !23              ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %11}, i64 0, metadata !18), !dbg !23
+  tail call void @llvm.dbg.value(metadata !{float %11}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !23
   br label %bb2, !dbg !23
 
 bb1:                                              ; preds = %entry
   %12 = fdiv float %d, %c, !dbg !24               ; <float> [#uses=3]
-  tail call void @llvm.dbg.value(metadata !{float %12}, i64 0, metadata !16), !dbg !24
+  tail call void @llvm.dbg.value(metadata !{float %12}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !24
   %13 = fmul float %12, %d, !dbg !25              ; <float> [#uses=1]
   %14 = fadd float %13, %c, !dbg !25              ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %14}, i64 0, metadata !14), !dbg !25
+  tail call void @llvm.dbg.value(metadata !{float %14}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !25
   %15 = fmul float %12, %b, !dbg !26              ; <float> [#uses=1]
   %16 = fadd float %15, %a, !dbg !26              ; <float> [#uses=1]
   %17 = fdiv float %16, %14, !dbg !26             ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %17}, i64 0, metadata !17), !dbg !26
+  tail call void @llvm.dbg.value(metadata !{float %17}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !26
   %18 = fmul float %12, %a, !dbg !27              ; <float> [#uses=1]
   %19 = fsub float %b, %18, !dbg !27              ; <float> [#uses=1]
   %20 = fdiv float %19, %14, !dbg !27             ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %20}, i64 0, metadata !18), !dbg !27
+  tail call void @llvm.dbg.value(metadata !{float %20}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !27
   br label %bb2, !dbg !27
 
 bb2:                                              ; preds = %bb1, %bb
@@ -75,9 +74,9 @@ bb6:                                              ; preds = %bb4
 bb8:                                              ; preds = %bb6
   %27 = tail call float @copysignf(float 0x7FF0000000000000, float %c) nounwind readnone, !dbg !30 ; <float> [#uses=2]
   %28 = fmul float %27, %a, !dbg !30              ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %28}, i64 0, metadata !17), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{float %28}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !30
   %29 = fmul float %27, %b, !dbg !31              ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %29}, i64 0, metadata !18), !dbg !31
+  tail call void @llvm.dbg.value(metadata !{float %29}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !31
   br label %bb46, !dbg !31
 
 bb9:                                              ; preds = %bb6, %bb4
@@ -107,24 +106,24 @@ bb15:                                             ; preds = %bb14
 bb16:                                             ; preds = %bb15
   %iftmp.0.0 = select i1 %33, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
   %42 = tail call float @copysignf(float %iftmp.0.0, float %a) nounwind readnone, !dbg !33 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %42}, i64 0, metadata !0), !dbg !33
+  tail call void @llvm.dbg.value(metadata !{float %42}, i64 0, metadata !0, metadata !{metadata !"0x102"}), !dbg !33
   %43 = fcmp ord float %b, 0.000000e+00           ; <i1> [#uses=1]
   %44 = fsub float %b, %b, !dbg !34               ; <float> [#uses=1]
   %45 = fcmp uno float %44, 0.000000e+00          ; <i1> [#uses=1]
   %46 = and i1 %43, %45, !dbg !34                 ; <i1> [#uses=1]
   %iftmp.1.0 = select i1 %46, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
   %47 = tail call float @copysignf(float %iftmp.1.0, float %b) nounwind readnone, !dbg !34 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %47}, i64 0, metadata !11), !dbg !34
+  tail call void @llvm.dbg.value(metadata !{float %47}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !34
   %48 = fmul float %42, %c, !dbg !35              ; <float> [#uses=1]
   %49 = fmul float %47, %d, !dbg !35              ; <float> [#uses=1]
   %50 = fadd float %48, %49, !dbg !35             ; <float> [#uses=1]
   %51 = fmul float %50, 0x7FF0000000000000, !dbg !35 ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %51}, i64 0, metadata !17), !dbg !35
+  tail call void @llvm.dbg.value(metadata !{float %51}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !35
   %52 = fmul float %47, %c, !dbg !36              ; <float> [#uses=1]
   %53 = fmul float %42, %d, !dbg !36              ; <float> [#uses=1]
   %54 = fsub float %52, %53, !dbg !36             ; <float> [#uses=1]
   %55 = fmul float %54, 0x7FF0000000000000, !dbg !36 ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %55}, i64 0, metadata !18), !dbg !36
+  tail call void @llvm.dbg.value(metadata !{float %55}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !36
   br label %bb46, !dbg !36
 
 bb27:                                             ; preds = %bb15, %bb14, %bb11
@@ -155,24 +154,24 @@ bb34:                                             ; preds = %bb33, %bb30
 bb35:                                             ; preds = %bb34
   %iftmp.2.0 = select i1 %59, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
   %67 = tail call float @copysignf(float %iftmp.2.0, float %c) nounwind readnone, !dbg !38 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %67}, i64 0, metadata !12), !dbg !38
+  tail call void @llvm.dbg.value(metadata !{float %67}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !38
   %68 = fcmp ord float %d, 0.000000e+00           ; <i1> [#uses=1]
   %69 = fsub float %d, %d, !dbg !39               ; <float> [#uses=1]
   %70 = fcmp uno float %69, 0.000000e+00          ; <i1> [#uses=1]
   %71 = and i1 %68, %70, !dbg !39                 ; <i1> [#uses=1]
   %iftmp.3.0 = select i1 %71, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
   %72 = tail call float @copysignf(float %iftmp.3.0, float %d) nounwind readnone, !dbg !39 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %72}, i64 0, metadata !13), !dbg !39
+  tail call void @llvm.dbg.value(metadata !{float %72}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !39
   %73 = fmul float %67, %a, !dbg !40              ; <float> [#uses=1]
   %74 = fmul float %72, %b, !dbg !40              ; <float> [#uses=1]
   %75 = fadd float %73, %74, !dbg !40             ; <float> [#uses=1]
   %76 = fmul float %75, 0.000000e+00, !dbg !40    ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %76}, i64 0, metadata !17), !dbg !40
+  tail call void @llvm.dbg.value(metadata !{float %76}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !40
   %77 = fmul float %67, %b, !dbg !41              ; <float> [#uses=1]
   %78 = fmul float %72, %a, !dbg !41              ; <float> [#uses=1]
   %79 = fsub float %77, %78, !dbg !41             ; <float> [#uses=1]
   %80 = fmul float %79, 0.000000e+00, !dbg !41    ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %80}, i64 0, metadata !18), !dbg !41
+  tail call void @llvm.dbg.value(metadata !{float %80}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !41
   br label %bb46, !dbg !41
 
 bb46:                                             ; preds = %bb35, %bb34, %bb33, %bb30, %bb16, %bb8, %bb2
@@ -196,30 +195,30 @@ declare float @fabsf(float)
 
 declare float @copysignf(float, float) nounwind readnone
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!48}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"a", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !45, metadata !2, metadata !"__divsc3", metadata !"__divsc3", metadata !"__divsc3", i32 1922, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, %0 (float, float, float, float)* @__divsc3, null, null, metadata !43, i32 1922} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !45} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !45, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !47, metadata !47, metadata !44, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !45, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00a\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00__divsc3\00__divsc3\00__divsc3\001922\000\001\000\006\000\001\001922", metadata !45, metadata !2, metadata !4, null, %0 (float, float, float, float)* @__divsc3, null, null, metadata !43} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !45} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", metadata !45, metadata !47, metadata !47, metadata !44, null,  null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !45, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !9, metadata !9, metadata !9, metadata !9}
-!6 = metadata !{i32 786454, metadata !46, metadata !7, metadata !"SCtype", i32 170, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ]
-!7 = metadata !{i32 786473, metadata !46} ; [ DW_TAG_file_type ]
-!8 = metadata !{i32 786468, metadata !45, metadata !2, metadata !"complex float", i32 0, i64 64, i64 32, i64 0, i32 0, i32 3} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 786454, metadata !46, metadata !7, metadata !"SFtype", i32 167, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_typedef ]
-!10 = metadata !{i32 786468, metadata !45, metadata !2, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!11 = metadata !{i32 786689, metadata !1, metadata !"b", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{i32 786689, metadata !1, metadata !"c", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!13 = metadata !{i32 786689, metadata !1, metadata !"d", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!14 = metadata !{i32 786688, metadata !15, metadata !"denom", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{i32 786443, metadata !45, metadata !1, i32 1922, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 786688, metadata !15, metadata !"ratio", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!17 = metadata !{i32 786688, metadata !15, metadata !"x", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!18 = metadata !{i32 786688, metadata !15, metadata !"y", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
+!6 = metadata !{metadata !"0x16\00SCtype\00170\000\000\000\000", metadata !46, metadata !7, metadata !8} ; [ DW_TAG_typedef ]
+!7 = metadata !{metadata !"0x29", metadata !46} ; [ DW_TAG_file_type ]
+!8 = metadata !{metadata !"0x24\00complex float\000\0064\0032\000\000\003", metadata !45, metadata !2} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x16\00SFtype\00167\000\000\000\000", metadata !46, metadata !7, metadata !10} ; [ DW_TAG_typedef ]
+!10 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", metadata !45, metadata !2} ; [ DW_TAG_base_type ]
+!11 = metadata !{metadata !"0x101\00b\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!12 = metadata !{metadata !"0x101\00c\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!13 = metadata !{metadata !"0x101\00d\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!14 = metadata !{metadata !"0x100\00denom\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!15 = metadata !{metadata !"0xb\001922\000\000", metadata !45, metadata !1} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{metadata !"0x100\00ratio\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!17 = metadata !{metadata !"0x100\00x\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!18 = metadata !{metadata !"0x100\00y\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
 !19 = metadata !{i32 1929, i32 0, metadata !15, null}
 !20 = metadata !{i32 1931, i32 0, metadata !15, null}
 !21 = metadata !{i32 1932, i32 0, metadata !15, null}
@@ -249,4 +248,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !45 = metadata !{metadata !"libgcc2.c", metadata !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
 !46 = metadata !{metadata !"libgcc2.h", metadata !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
 !47 = metadata !{i32 0}
-!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index e11b538..09120a1 100644
--- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-darwin10"
 
 define i8* @bar(%struct.a* %myvar) nounwind optsize noinline ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.a* %myvar}, i64 0, metadata !8)
+  tail call void @llvm.dbg.value(metadata !{%struct.a* %myvar}, i64 0, metadata !8, metadata !{metadata !"0x102"})
   %0 = getelementptr inbounds %struct.a* %myvar, i64 0, i32 0, !dbg !28 ; <i32*> [#uses=1]
   %1 = load i32* %0, align 8, !dbg !28            ; <i32> [#uses=1]
   tail call void @foo(i32 %1) nounwind optsize noinline ssp, !dbg !28
@@ -19,41 +19,41 @@ entry:
 
 declare void @foo(i32) nounwind optsize noinline ssp
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!38}
 
-!0 = metadata !{i32 786484, i32 0, metadata !1, metadata !"ret", metadata !"ret", metadata !"", metadata !1, i32 7, metadata !3, i1 false, i1 true, null, null} ; [ DW_TAG_variable ]
-!1 = metadata !{i32 786473, metadata !36} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !36, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !37, metadata !37, metadata !32, metadata !31,  metadata !37, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786468, metadata !36, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!4 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !1, i32 12, metadata !3, i32 0, null} ; [ DW_TAG_arg_variable ]
-!5 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 13, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, void (i32)* @foo, null, null, metadata !33, i32 13} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x34\00ret\00ret\00\007\000\001", metadata !1, metadata !1, metadata !3, null, null} ; [ DW_TAG_variable ]
+!1 = metadata !{metadata !"0x29", metadata !36} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", metadata !36, metadata !37, metadata !37, metadata !32, metadata !31,  metadata !37} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !36, metadata !1} ; [ DW_TAG_base_type ]
+!4 = metadata !{metadata !"0x101\00x\0012\000", metadata !5, metadata !1, metadata !3} ; [ DW_TAG_arg_variable ]
+!5 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0013\000\001\000\006\000\001\0013", metadata !36, metadata !1, metadata !6, null, void (i32)* @foo, null, null, metadata !33} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !36, metadata !1, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !3}
-!8 = metadata !{i32 786689, metadata !9, metadata !"myvar", metadata !1, i32 17, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"bar", metadata !"bar", metadata !"bar", i32 17, metadata !10, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i8* (%struct.a*)* @bar, null, null, metadata !34, i32 17} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !"0x101\00myvar\0017\000", metadata !9, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0x2e\00bar\00bar\00bar\0017\000\001\000\006\000\001\0017", metadata !36, metadata !1, metadata !10, null, i8* (%struct.a*)* @bar, null, null, metadata !34} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !36, metadata !1, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!13 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786451, metadata !36, metadata !1, metadata !"a", i32 2, i64 128, i64 64, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 2, size 128, align 64, offset 0] [def] [from ]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, null} ; [ DW_TAG_pointer_type ]
+!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, metadata !14} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{metadata !"0x13\00a\002\00128\0064\000\000\000", metadata !36, metadata !1, null, metadata !15, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 2, size 128, align 64, offset 0] [def] [from ]
 !15 = metadata !{metadata !16, metadata !17}
-!16 = metadata !{i32 786445, metadata !36, metadata !14, metadata !"c", i32 3, i64 32, i64 32, i64 0, i32 0, metadata !3} ; [ DW_TAG_member ]
-!17 = metadata !{i32 786445, metadata !36, metadata !14, metadata !"d", i32 4, i64 64, i64 64, i64 64, i32 0, metadata !13} ; [ DW_TAG_member ]
-!18 = metadata !{i32 786689, metadata !19, metadata !"argc", metadata !1, i32 22, metadata !3, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"main", metadata !"main", metadata !"main", i32 22, metadata !20, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, null, null, null, metadata !35, i32 22} ; [ DW_TAG_subprogram ]
-!20 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !"0xd\00c\003\0032\0032\000\000", metadata !36, metadata !14, metadata !3} ; [ DW_TAG_member ]
+!17 = metadata !{metadata !"0xd\00d\004\0064\0064\0064\000", metadata !36, metadata !14, metadata !13} ; [ DW_TAG_member ]
+!18 = metadata !{metadata !"0x101\00argc\0022\000", metadata !19, metadata !1, metadata !3} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x2e\00main\00main\00main\0022\000\001\000\006\000\001\0022", metadata !36, metadata !1, metadata !20, null, null, null, null, metadata !35} ; [ DW_TAG_subprogram ]
+!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !36, metadata !1, null, metadata !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !21 = metadata !{metadata !3, metadata !3, metadata !22}
-!22 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !24} ; [ DW_TAG_pointer_type ]
-!24 = metadata !{i32 786468, metadata !36, metadata !1, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!25 = metadata !{i32 786689, metadata !19, metadata !"argv", metadata !1, i32 22, metadata !22, i32 0, null} ; [ DW_TAG_arg_variable ]
-!26 = metadata !{i32 786688, metadata !27, metadata !"e", metadata !1, i32 23, metadata !14, i32 0, null} ; [ DW_TAG_auto_variable ]
-!27 = metadata !{i32 786443, metadata !36, metadata !19, i32 22, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!22 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, metadata !23} ; [ DW_TAG_pointer_type ]
+!23 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, metadata !24} ; [ DW_TAG_pointer_type ]
+!24 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !36, metadata !1} ; [ DW_TAG_base_type ]
+!25 = metadata !{metadata !"0x101\00argv\0022\000", metadata !19, metadata !1, metadata !22} ; [ DW_TAG_arg_variable ]
+!26 = metadata !{metadata !"0x100\00e\0023\000", metadata !27, metadata !1, metadata !14} ; [ DW_TAG_auto_variable ]
+!27 = metadata !{metadata !"0xb\0022\000\000", metadata !36, metadata !19} ; [ DW_TAG_lexical_block ]
 !28 = metadata !{i32 18, i32 0, metadata !29, null}
-!29 = metadata !{i32 786443, metadata !36, metadata !9, i32 17, i32 0, i32 1} ; [ DW_TAG_lexical_block ]
+!29 = metadata !{metadata !"0xb\0017\000\001", metadata !36, metadata !9} ; [ DW_TAG_lexical_block ]
 !30 = metadata !{i32 19, i32 0, metadata !29, null}
 !31 = metadata !{metadata !0}
 !32 = metadata !{metadata !5, metadata !9, metadata !19}
@@ -73,18 +73,22 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 
 ; CHECK: Ldebug_loc0:
-; CHECK-NEXT: .quad   Lfunc_begin0
-; CHECK-NEXT: .quad   [[LABEL]]
+; CHECK-NEXT: [[SET1:.*]] = Lfunc_begin0-Lfunc_begin0
+; CHECK-NEXT: .quad   [[SET1]]
+; CHECK-NEXT: [[SET2:.*]] = [[LABEL]]-Lfunc_begin0
+; CHECK-NEXT: .quad   [[SET2]]
 ; CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}}               ## Loc expr size
 ; CHECK-NEXT: .short  Lset{{.*}}
 ; CHECK-NEXT: Ltmp{{.*}}:
 ; CHECK-NEXT: .byte   85
 ; CHECK-NEXT: Ltmp{{.*}}:
-; CHECK-NEXT: .quad   [[LABEL]]
-; CHECK-NEXT: .quad   [[CLOBBER]]
+; CHECK-NEXT: [[SET3:.*]] = [[LABEL]]-Lfunc_begin0
+; CHECK-NEXT: .quad   [[SET3]]
+; CHECK-NEXT: [[SET4:.*]] = [[CLOBBER]]-Lfunc_begin0
+; CHECK-NEXT: .quad   [[SET4]]
 ; CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}}               ## Loc expr size
 ; CHECK-NEXT: .short  Lset{{.*}}
 ; CHECK-NEXT: Ltmp{{.*}}:
 ; CHECK-NEXT: .byte   83
 ; CHECK-NEXT: Ltmp{{.*}}:
-!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-05-28-Crash.ll b/test/CodeGen/X86/2010-05-28-Crash.ll
index 1114c8d..b0a4e8d 100644
--- a/test/CodeGen/X86/2010-05-28-Crash.ll
+++ b/test/CodeGen/X86/2010-05-28-Crash.ll
@@ -4,19 +4,19 @@
 
 define i32 @foo(i32 %y) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %y}, i64 0, metadata !0)
+  tail call void @llvm.dbg.value(metadata !{i32 %y}, i64 0, metadata !0, metadata !{metadata !"0x102"})
   %0 = tail call i32 (...)* @zoo(i32 %y) nounwind, !dbg !9 ; <i32> [#uses=1]
   ret i32 %0, !dbg !9
 }
 
 declare i32 @zoo(...)
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 define i32 @bar(i32 %x) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !7)
-  tail call void @llvm.dbg.value(metadata !11, i64 0, metadata !0) nounwind
+  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !7, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata !11, i64 0, metadata !0, metadata !{metadata !"0x102"}) nounwind
   %0 = tail call i32 (...)* @zoo(i32 1) nounwind, !dbg !12 ; <i32> [#uses=1]
   %1 = add nsw i32 %0, %x, !dbg !13               ; <i32> [#uses=1]
   ret i32 %1, !dbg !13
@@ -25,21 +25,21 @@ entry:
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!20}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"y", metadata !2, i32 2, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 (i32)* @foo, null, null, metadata !15, i32 2} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !18, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !19, metadata !19, metadata !17, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00y\002\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\001\002", metadata !18, metadata !2, metadata !4, null, i32 (i32)* @foo, null, null, metadata !15} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", metadata !18, metadata !19, metadata !19, metadata !17, null,  null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !6}
-!6 = metadata !{i32 786468, metadata !18, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786689, metadata !8, metadata !"x", metadata !2, i32 6, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"bar", metadata !"bar", metadata !"bar", i32 6, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 (i32)* @bar, null, null, metadata !16, i32 6} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !18, metadata !2} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0x101\00x\006\000", metadata !8, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{metadata !"0x2e\00bar\00bar\00bar\006\000\001\000\006\000\001\006", metadata !18, metadata !2, metadata !4, null, i32 (i32)* @bar, null, null, metadata !16} ; [ DW_TAG_subprogram ]
 !9 = metadata !{i32 3, i32 0, metadata !10, null}
-!10 = metadata !{i32 786443, metadata !18, metadata !1, i32 2, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!10 = metadata !{metadata !"0xb\002\000\000", metadata !18, metadata !1} ; [ DW_TAG_lexical_block ]
 !11 = metadata !{i32 1}
 !12 = metadata !{i32 3, i32 0, metadata !10, metadata !13}
 !13 = metadata !{i32 7, i32 0, metadata !14, null}
-!14 = metadata !{i32 786443, metadata !18, metadata !8, i32 6, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0xb\006\000\000", metadata !18, metadata !8} ; [ DW_TAG_lexical_block ]
 !15 = metadata !{metadata !0}
 !16 = metadata !{metadata !7}
 !17 = metadata !{metadata !1, metadata !8}
@@ -49,4 +49,4 @@ entry:
 ;CHECK: DEBUG_VALUE: bar:x <- E
 ;CHECK: Ltmp
 ;CHECK:	DEBUG_VALUE: foo:y <- 1{{$}}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
index 4181c26..dea9162 100644
--- a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
+++ b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
@@ -10,51 +10,51 @@ target triple = "x86_64-apple-darwin10.2"
 define i32 @_ZN3foo3bazEi(%struct.foo* nocapture %this, i32 %x) nounwind readnone optsize noinline ssp align 2 {
 ;CHECK: DEBUG_VALUE: baz:this <- RDI{{$}}
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.foo* %this}, i64 0, metadata !15)
-  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !16)
+  tail call void @llvm.dbg.value(metadata !{%struct.foo* %this}, i64 0, metadata !15, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !16, metadata !{metadata !"0x102"})
   %0 = mul nsw i32 %x, 7, !dbg !29                ; <i32> [#uses=1]
   %1 = add nsw i32 %0, 1, !dbg !29                ; <i32> [#uses=1]
   ret i32 %1, !dbg !29
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!4}
 !llvm.module.flags = !{!34}
 !llvm.dbg.lv = !{!0, !14, !15, !16, !17, !24, !25, !28}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"this", metadata !3, i32 11, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !31, metadata !2, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEi", i32 11, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 (%struct.foo*, i32)* null, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786451, metadata !31, metadata !3, metadata !"foo", i32 3, i64 32, i64 32, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 3, size 32, align 32, offset 0] [def] [from ]
-!3 = metadata !{i32 786473, metadata !31} ; [ DW_TAG_file_type ]
-!4 = metadata !{i32 786449, metadata !31, i32 4, metadata !"4.2.1 LLVM build", i1 true, metadata !"", i32 0, metadata !32, metadata !32, metadata !33, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x101\00this\0011\000", metadata !1, metadata !3, metadata !12} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3foo3barEi\0011\000\001\000\006\000\001\0011", metadata !31, metadata !2, metadata !9, null, i32 (%struct.foo*, i32)* null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x13\00foo\003\0032\0032\000\000\000", metadata !31, metadata !3, null, metadata !5, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 3, size 32, align 32, offset 0] [def] [from ]
+!3 = metadata !{metadata !"0x29", metadata !31} ; [ DW_TAG_file_type ]
+!4 = metadata !{metadata !"0x11\004\004.2.1 LLVM build\001\00\000\00\000", metadata !31, metadata !32, metadata !32, metadata !33, null, null} ; [ DW_TAG_compile_unit ]
 !5 = metadata !{metadata !6, metadata !1, metadata !8}
-!6 = metadata !{i32 786445, metadata !31, metadata !2, metadata !"y", i32 8, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ]
-!7 = metadata !{i32 786468, metadata !31, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 786478, metadata !31, metadata !2, metadata !"baz", metadata !"baz", metadata !"_ZN3foo3bazEi", i32 15, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null, i32 15} ; [ DW_TAG_subprogram ]
-!9 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0xd\00y\008\0032\0032\000\000", metadata !31, metadata !2, metadata !7} ; [ DW_TAG_member ]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !31, metadata !3} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0x2e\00baz\00baz\00_ZN3foo3bazEi\0015\000\001\000\006\000\001\0015", metadata !31, metadata !2, metadata !9, null, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null} ; [ DW_TAG_subprogram ]
+!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !3, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !10 = metadata !{metadata !7, metadata !11, metadata !7}
-!11 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !2} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{i32 786470, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !13} ; [ DW_TAG_const_type ]
-!13 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !2} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786689, metadata !1, metadata !"x", metadata !3, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 786689, metadata !8, metadata !"this", metadata !3, i32 15, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ]
-!16 = metadata !{i32 786689, metadata !8, metadata !"x", metadata !3, i32 15, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{i32 786689, metadata !18, metadata !"argc", metadata !3, i32 19, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!18 = metadata !{i32 786478, metadata !31, metadata !3, metadata !"main", metadata !"main", metadata !"main", i32 19, metadata !19, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, null, null, null, null, i32 19} ; [ DW_TAG_subprogram ]
-!19 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !31, metadata !3, metadata !2} ; [ DW_TAG_pointer_type ]
+!12 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !31, metadata !3, metadata !13} ; [ DW_TAG_const_type ]
+!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !31, metadata !3, metadata !2} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{metadata !"0x101\00x\0011\000", metadata !1, metadata !3, metadata !7} ; [ DW_TAG_arg_variable ]
+!15 = metadata !{metadata !"0x101\00this\0015\000", metadata !8, metadata !3, metadata !12} ; [ DW_TAG_arg_variable ]
+!16 = metadata !{metadata !"0x101\00x\0015\000", metadata !8, metadata !3, metadata !7} ; [ DW_TAG_arg_variable ]
+!17 = metadata !{metadata !"0x101\00argc\0019\000", metadata !18, metadata !3, metadata !7} ; [ DW_TAG_arg_variable ]
+!18 = metadata !{metadata !"0x2e\00main\00main\00main\0019\000\001\000\006\000\001\0019", metadata !31, metadata !3, metadata !19, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!19 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !3, null, metadata !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !20 = metadata !{metadata !7, metadata !7, metadata !21}
-!21 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !22} ; [ DW_TAG_pointer_type ]
-!22 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{i32 786468, metadata !31, metadata !3, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!24 = metadata !{i32 786689, metadata !18, metadata !"argv", metadata !3, i32 19, metadata !21, i32 0, null} ; [ DW_TAG_arg_variable ]
-!25 = metadata !{i32 786688, metadata !26, metadata !"a", metadata !3, i32 20, metadata !2, i32 0, null} ; [ DW_TAG_auto_variable ]
-!26 = metadata !{i32 786443, metadata !31, metadata !27, i32 19, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!27 = metadata !{i32 786443, metadata !31, metadata !18, i32 19, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!28 = metadata !{i32 786688, metadata !26, metadata !"b", metadata !3, i32 21, metadata !7, i32 0, null} ; [ DW_TAG_auto_variable ]
+!21 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !31, metadata !3, metadata !22} ; [ DW_TAG_pointer_type ]
+!22 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !31, metadata !3, metadata !23} ; [ DW_TAG_pointer_type ]
+!23 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !31, metadata !3} ; [ DW_TAG_base_type ]
+!24 = metadata !{metadata !"0x101\00argv\0019\000", metadata !18, metadata !3, metadata !21} ; [ DW_TAG_arg_variable ]
+!25 = metadata !{metadata !"0x100\00a\0020\000", metadata !26, metadata !3, metadata !2} ; [ DW_TAG_auto_variable ]
+!26 = metadata !{metadata !"0xb\0019\000\000", metadata !31, metadata !27} ; [ DW_TAG_lexical_block ]
+!27 = metadata !{metadata !"0xb\0019\000\000", metadata !31, metadata !18} ; [ DW_TAG_lexical_block ]
+!28 = metadata !{metadata !"0x100\00b\0021\000", metadata !26, metadata !3, metadata !7} ; [ DW_TAG_auto_variable ]
 !29 = metadata !{i32 16, i32 0, metadata !30, null}
-!30 = metadata !{i32 786443, metadata !31, metadata !8, i32 15, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!30 = metadata !{metadata !"0xb\0015\000\000", metadata !31, metadata !8} ; [ DW_TAG_lexical_block ]
 !31 = metadata !{metadata !"foo.cp", metadata !"/tmp/"}
 !32 = metadata !{i32 0}
 !33 = metadata !{metadata !1, metadata !8, metadata !18}
-!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-07-06-DbgCrash.ll b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
index b49aec3..9d65dc1 100644
--- a/test/CodeGen/X86/2010-07-06-DbgCrash.ll
+++ b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
@@ -3,29 +3,29 @@
 @.str = private constant [4 x i8] c"one\00", align 1 ; <[4 x i8]*> [#uses=1]
 @.str1 = private constant [4 x i8] c"two\00", align 1 ; <[5 x i8]*> [#uses=1]
 @C.9.2167 = internal constant [2 x i8*] [i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0)]
-!38 = metadata !{i32 524329, metadata !109} ; [ DW_TAG_file_type ]
-!39 = metadata !{i32 524305, metadata !109, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)", i1 true, metadata !"", i32 0, metadata !108, metadata !108, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!46 = metadata !{i32 524303, metadata !109, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !47} ; [ DW_TAG_pointer_type ]
-!47 = metadata !{i32 524324, metadata !109, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!97 = metadata !{i32 524334, i32 0, metadata !39, metadata !"main", metadata !"main", metadata !"main", i32 73, metadata !98, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!98 = metadata !{i32 524309, metadata !109, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !99, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!38 = metadata !{metadata !"0x29", metadata !109} ; [ DW_TAG_file_type ]
+!39 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)\001\00\000\00\000", metadata !109, metadata !108, metadata !108, null, null, null} ; [ DW_TAG_compile_unit ]
+!46 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !109, null, metadata !47} ; [ DW_TAG_pointer_type ]
+!47 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !109, null} ; [ DW_TAG_base_type ]
+!97 = metadata !{metadata !"0x2e\00main\00main\00main\0073\000\001\000\006\000\000\000", i32 0, metadata !39, metadata !98, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!98 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !109, null, null, metadata !99, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !99 = metadata !{metadata !100}
-!100 = metadata !{i32 524324, metadata !109, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!100 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !109, null} ; [ DW_TAG_base_type ]
 !101 = metadata !{[2 x i8*]* @C.9.2167}
-!102 = metadata !{i32 524544, metadata !103, metadata !"find_strings", metadata !38, i32 75, metadata !104, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
-!103 = metadata !{i32 524299, null, metadata !97, i32 73, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!104 = metadata !{i32 524289, metadata !109, null, metadata !"", i32 0, i64 85312, i64 64, i64 0, i32 0, metadata !46, metadata !105, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 85312, align 64, offset 0] [from ]
+!102 = metadata !{metadata !"0x100\00find_strings\0075\000", metadata !103, metadata !38, metadata !104} ; [ DW_TAG_auto_variable ]
+!103 = metadata !{metadata !"0xb\0073\000\000", null, metadata !97} ; [ DW_TAG_lexical_block ]
+!104 = metadata !{metadata !"0x1\00\000\0085312\0064\000\000", metadata !109, null, metadata !46, metadata !105, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 85312, align 64, offset 0] [from ]
 !105 = metadata !{metadata !106}
-!106 = metadata !{i32 524321, i64 0, i64 1333}    ; [ DW_TAG_subrange_type ]
+!106 = metadata !{metadata !"0x21\000\001333"}    ; [ DW_TAG_subrange_type ]
 !107 = metadata !{i32 73, i32 0, metadata !103, null}
 !108 = metadata !{i32 0}
 !109 = metadata !{metadata !"pbmsrch.c", metadata !"/Users/grawp/LLVM/test-suite/MultiSource/Benchmarks/MiBench/office-stringsearch"}
 
 define i32 @main() nounwind ssp {
 bb.nph:
-  tail call void @llvm.dbg.declare(metadata !101, metadata !102), !dbg !107
+  tail call void @llvm.dbg.declare(metadata !101, metadata !102, metadata !{metadata !"0x102"}), !dbg !107
   ret i32 0, !dbg !107
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
diff --git a/test/CodeGen/X86/2010-08-04-StackVariable.ll b/test/CodeGen/X86/2010-08-04-StackVariable.ll
index 09e34ef..a613939 100644
--- a/test/CodeGen/X86/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/X86/2010-08-04-StackVariable.ll
@@ -6,8 +6,8 @@
 define i32 @_Z3fooi4SVal(i32 %i, %struct.SVal* noalias %location) nounwind ssp {
 entry:
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !23), !dbg !24
-  call void @llvm.dbg.value(metadata !{%struct.SVal* %location}, i64 0, metadata !25), !dbg !24
+  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
+  call void @llvm.dbg.value(metadata !{%struct.SVal* %location}, i64 0, metadata !25, metadata !{metadata !"0x102"}), !dbg !24
   %0 = icmp ne i32 %i, 0, !dbg !27                ; <i1> [#uses=1]
   br i1 %0, label %bb, label %bb1, !dbg !27
 
@@ -34,7 +34,7 @@ return:                                           ; preds = %bb2
 define linkonce_odr void @_ZN4SValC1Ev(%struct.SVal* %this) nounwind ssp align 2 {
 entry:
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{%struct.SVal* %this}, i64 0, metadata !31), !dbg !34
+  call void @llvm.dbg.value(metadata !{%struct.SVal* %this}, i64 0, metadata !31, metadata !{metadata !"0x102"}), !dbg !34
   %0 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 0, !dbg !34 ; <i8**> [#uses=1]
   store i8* null, i8** %0, align 8, !dbg !34
   %1 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 1, !dbg !34 ; <i32*> [#uses=1]
@@ -45,14 +45,14 @@ return:                                           ; preds = %entry
   ret void, !dbg !35
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define i32 @main() nounwind ssp {
 entry:
   %0 = alloca %struct.SVal                        ; <%struct.SVal*> [#uses=3]
   %v = alloca %struct.SVal                        ; <%struct.SVal*> [#uses=4]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.SVal* %v}, metadata !38), !dbg !41
+  call void @llvm.dbg.declare(metadata !{%struct.SVal* %v}, metadata !38, metadata !{metadata !"0x102"}), !dbg !41
   call void @_ZN4SValC1Ev(%struct.SVal* %v) nounwind, !dbg !41
   %1 = getelementptr inbounds %struct.SVal* %v, i32 0, i32 1, !dbg !42 ; <i32*> [#uses=1]
   store i32 1, i32* %1, align 8, !dbg !42
@@ -65,65 +65,65 @@ entry:
   %7 = load i32* %6, align 8, !dbg !43            ; <i32> [#uses=1]
   store i32 %7, i32* %5, align 8, !dbg !43
   %8 = call i32 @_Z3fooi4SVal(i32 2, %struct.SVal* noalias %0) nounwind, !dbg !43 ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{i32 %8}, i64 0, metadata !44), !dbg !43
+  call void @llvm.dbg.value(metadata !{i32 %8}, i64 0, metadata !44, metadata !{metadata !"0x102"}), !dbg !43
   br label %return, !dbg !45
 
 return:                                           ; preds = %entry
   ret i32 0, !dbg !45
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!49}
 !46 = metadata !{metadata !16, metadata !17, metadata !20}
 
-!0 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"", i32 11, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786451, metadata !47, metadata !2, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
-!2 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !47, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !48, metadata !48, metadata !46, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x2e\00SVal\00SVal\00\0011\000\000\000\006\000\000\0011", metadata !47, metadata !1, metadata !14, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x13\00SVal\001\00128\0064\000\000\000", metadata !47, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
+!2 = metadata !{metadata !"0x29", metadata !47} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\001", metadata !47, metadata !48, metadata !48, metadata !46, null,  null} ; [ DW_TAG_compile_unit ]
 !4 = metadata !{metadata !5, metadata !7, metadata !0, metadata !9}
-!5 = metadata !{i32 786445, metadata !47, metadata !1, metadata !"Data", i32 7, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
-!6 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 786445, metadata !47, metadata !1, metadata !"Kind", i32 8, i64 32, i64 32, i64 64, i32 0, metadata !8} ; [ DW_TAG_member ]
-!8 = metadata !{i32 786468, metadata !47, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"~SVal", metadata !"~SVal", metadata !"", i32 12, metadata !10, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 12} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0xd\00Data\007\0064\0064\000\000", metadata !47, metadata !1, metadata !6} ; [ DW_TAG_member ]
+!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !47, metadata !2, null} ; [ DW_TAG_pointer_type ]
+!7 = metadata !{metadata !"0xd\00Kind\008\0032\0032\0064\000", metadata !47, metadata !1, metadata !8} ; [ DW_TAG_member ]
+!8 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", metadata !47, metadata !2} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x2e\00~SVal\00~SVal\00\0012\000\000\000\006\000\000\0012", metadata !47, metadata !1, metadata !10, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{null, metadata !12, metadata !13}
-!12 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !1} ; [ DW_TAG_pointer_type ]
-!13 = metadata !{i32 786468, metadata !47, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !47, metadata !2, metadata !1} ; [ DW_TAG_pointer_type ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !47, metadata !2} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{null, metadata !12}
-!16 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"_ZN4SValC1Ev", i32 11, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!17 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3fooi4SVal", i32 16, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null, i32 16} ; [ DW_TAG_subprogram ]
-!18 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !"0x2e\00SVal\00SVal\00_ZN4SValC1Ev\0011\000\001\000\006\000\000\0011", metadata !47, metadata !1, metadata !14, null, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null} ; [ DW_TAG_subprogram ]
+!17 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooi4SVal\0016\000\001\000\006\000\000\0016", metadata !47, metadata !2, metadata !18, null, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null} ; [ DW_TAG_subprogram ]
+!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !19 = metadata !{metadata !13, metadata !13, metadata !1}
-!20 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"main", metadata !"main", metadata !"main", i32 23, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @main, null, null, null, i32 23} ; [ DW_TAG_subprogram ]
-!21 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = metadata !{metadata !"0x2e\00main\00main\00main\0023\000\001\000\006\000\000\0023", metadata !47, metadata !2, metadata !21, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
+!21 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !22 = metadata !{metadata !13}
-!23 = metadata !{i32 786689, metadata !17, metadata !"i", metadata !2, i32 16, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
+!23 = metadata !{metadata !"0x101\00i\0016\000", metadata !17, metadata !2, metadata !13} ; [ DW_TAG_arg_variable ]
 !24 = metadata !{i32 16, i32 0, metadata !17, null}
-!25 = metadata !{i32 786689, metadata !17, metadata !"location", metadata !2, i32 16, metadata !26, i32 0, null} ; [ DW_TAG_arg_variable ]
-!26 = metadata !{i32 786448, metadata !47, metadata !2, metadata !"SVal", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !1} ; [ DW_TAG_reference_type ]
+!25 = metadata !{metadata !"0x101\00location\0016\000", metadata !17, metadata !2, metadata !26} ; [ DW_TAG_arg_variable ]
+!26 = metadata !{metadata !"0x10\00SVal\000\0064\0064\000\000", metadata !47, metadata !2, metadata !1} ; [ DW_TAG_reference_type ]
 !27 = metadata !{i32 17, i32 0, metadata !28, null}
-!28 = metadata !{i32 786443, metadata !47, metadata !17, i32 16, i32 0, i32 2} ; [ DW_TAG_lexical_block ]
+!28 = metadata !{metadata !"0xb\0016\000\002", metadata !47, metadata !17} ; [ DW_TAG_lexical_block ]
 !29 = metadata !{i32 18, i32 0, metadata !28, null}
 !30 = metadata !{i32 20, i32 0, metadata !28, null}
-!31 = metadata !{i32 786689, metadata !16, metadata !"this", metadata !2, i32 11, metadata !32, i32 0, null} ; [ DW_TAG_arg_variable ]
-!32 = metadata !{i32 786470, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !33} ; [ DW_TAG_const_type ]
-!33 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !1} ; [ DW_TAG_pointer_type ]
+!31 = metadata !{metadata !"0x101\00this\0011\000", metadata !16, metadata !2, metadata !32} ; [ DW_TAG_arg_variable ]
+!32 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !47, metadata !2, metadata !33} ; [ DW_TAG_const_type ]
+!33 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !47, metadata !2, metadata !1} ; [ DW_TAG_pointer_type ]
 !34 = metadata !{i32 11, i32 0, metadata !16, null}
 !35 = metadata !{i32 11, i32 0, metadata !36, null}
-!36 = metadata !{i32 786443, metadata !47, metadata !37, i32 11, i32 0, i32 1} ; [ DW_TAG_lexical_block ]
-!37 = metadata !{i32 786443, metadata !47, metadata !16, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!38 = metadata !{i32 786688, metadata !39, metadata !"v", metadata !2, i32 24, metadata !1, i32 0, null} ; [ DW_TAG_auto_variable ]
-!39 = metadata !{i32 786443, metadata !47, metadata !40, i32 23, i32 0, i32 4} ; [ DW_TAG_lexical_block ]
-!40 = metadata !{i32 786443, metadata !47, metadata !20, i32 23, i32 0, i32 3} ; [ DW_TAG_lexical_block ]
+!36 = metadata !{metadata !"0xb\0011\000\001", metadata !47, metadata !37} ; [ DW_TAG_lexical_block ]
+!37 = metadata !{metadata !"0xb\0011\000\000", metadata !47, metadata !16} ; [ DW_TAG_lexical_block ]
+!38 = metadata !{metadata !"0x100\00v\0024\000", metadata !39, metadata !2, metadata !1} ; [ DW_TAG_auto_variable ]
+!39 = metadata !{metadata !"0xb\0023\000\004", metadata !47, metadata !40} ; [ DW_TAG_lexical_block ]
+!40 = metadata !{metadata !"0xb\0023\000\003", metadata !47, metadata !20} ; [ DW_TAG_lexical_block ]
 !41 = metadata !{i32 24, i32 0, metadata !39, null}
 !42 = metadata !{i32 25, i32 0, metadata !39, null}
 !43 = metadata !{i32 26, i32 0, metadata !39, null}
-!44 = metadata !{i32 786688, metadata !39, metadata !"k", metadata !2, i32 26, metadata !13, i32 0, null} ; [ DW_TAG_auto_variable ]
+!44 = metadata !{metadata !"0x100\00k\0026\000", metadata !39, metadata !2, metadata !13} ; [ DW_TAG_auto_variable ]
 !45 = metadata !{i32 27, i32 0, metadata !39, null}
 !47 = metadata !{metadata !"small.cc", metadata !"/Users/manav/R8248330"}
 !48 = metadata !{i32 0}
-!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
index a65b632..f52e922 100644
--- a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
+++ b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
@@ -15,21 +15,21 @@ entry:
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!17}
 
-!0 = metadata !{i32 786478, metadata !14, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 53, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !14} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !15, i32 12, metadata !"clang version 2.9 (trunk 114084)", i1 false, metadata !"", i32 0, metadata !16, metadata !16, metadata !13, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !14, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0053\000\001\000\006\000\000\000", metadata !14, metadata !1, metadata !3, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !14} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 114084)\000\00\000\00\000", metadata !15, metadata !16, metadata !16, metadata !13, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !14, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, metadata !14, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !15, metadata !7, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786473, metadata !15} ; [ DW_TAG_file_type ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !14, metadata !1} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", metadata !15, metadata !7, metadata !3, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ]
+!7 = metadata !{metadata !"0x29", metadata !15} ; [ DW_TAG_file_type ]
 !8 = metadata !{i32 53, i32 13, metadata !9, null}
-!9 = metadata !{i32 786443, metadata !14, metadata !0, i32 53, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
+!9 = metadata !{metadata !"0xb\0053\0011\000", metadata !14, metadata !0} ; [ DW_TAG_lexical_block ]
 !10 = metadata !{i32 4, i32 13, metadata !11, null}
-!11 = metadata !{i32 786443, metadata !15, metadata !12, i32 4, i32 13, i32 2} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 786443, metadata !15, metadata !6, i32 4, i32 11, i32 1} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{metadata !"0xb\004\0013\002", metadata !15, metadata !12} ; [ DW_TAG_lexical_block ]
+!12 = metadata !{metadata !"0xb\004\0011\001", metadata !15, metadata !6} ; [ DW_TAG_lexical_block ]
 !13 = metadata !{metadata !0, metadata !6}
 !14 = metadata !{metadata !"", metadata !"/private/tmp"}
 !15 = metadata !{metadata !"bug.c", metadata !"/private/tmp"}
 !16 = metadata !{i32 0}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-11-02-DbgParameter.ll b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
index 21ac7c9..53fb0af 100644
--- a/test/CodeGen/X86/2010-11-02-DbgParameter.ll
+++ b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
@@ -9,32 +9,32 @@ target triple = "i386-apple-darwin11.0.0"
 define i32 @foo(%struct.bar* nocapture %i) nounwind readnone optsize noinline ssp {
 ; CHECK: TAG_formal_parameter
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.bar* %i}, i64 0, metadata !6), !dbg !12
+  tail call void @llvm.dbg.value(metadata !{%struct.bar* %i}, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !12
   ret i32 1, !dbg !13
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!19}
 
-!0 = metadata !{i32 786478, metadata !17, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (%struct.bar*)* @foo, null, null, metadata !16, i32 3} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !17} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !17, i32 12, metadata !"clang version 2.9 (trunk 117922)", i1 true, metadata !"", i32 0, metadata !18, metadata !18, metadata !15, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !17, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\001\000\006\00256\001\003", metadata !17, metadata !1, metadata !3, null, i32 (%struct.bar*)* @foo, null, null, metadata !16} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !17} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 117922)\001\00\000\00\000", metadata !17, metadata !18, metadata !18, metadata !15, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !17, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, metadata !17, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786689, metadata !0, metadata !"i", metadata !1, i32 3, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{i32 786447, metadata !17, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{i32 786451, metadata !17, metadata !1, metadata !"bar", i32 2, i64 64, i64 32, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 2, size 64, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !17, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x101\00i\003\000", metadata !0, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
+!7 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !17, metadata !1, metadata !8} ; [ DW_TAG_pointer_type ]
+!8 = metadata !{metadata !"0x13\00bar\002\0064\0032\000\000\000", metadata !17, metadata !1, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 2, size 64, align 32, offset 0] [def] [from ]
 !9 = metadata !{metadata !10, metadata !11}
-!10 = metadata !{i32 786445, metadata !17,  metadata !1, metadata !"x", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ]
-!11 = metadata !{i32 786445, metadata !17, metadata !1, metadata !"y", i32 2, i64 32, i64 32, i64 32, i32 0, metadata !5} ; [ DW_TAG_member ]
+!10 = metadata !{metadata !"0xd\00x\002\0032\0032\000\000", metadata !17,  metadata !1, metadata !5} ; [ DW_TAG_member ]
+!11 = metadata !{metadata !"0xd\00y\002\0032\0032\0032\000", metadata !17, metadata !1, metadata !5} ; [ DW_TAG_member ]
 !12 = metadata !{i32 3, i32 47, metadata !0, null}
 !13 = metadata !{i32 4, i32 2, metadata !14, null}
-!14 = metadata !{i32 786443, metadata !17, metadata !0, i32 3, i32 50, i32 0} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0xb\003\0050\000", metadata !17, metadata !0} ; [ DW_TAG_lexical_block ]
 !15 = metadata !{metadata !0}
 !16 = metadata !{metadata !6}
 !17 = metadata !{metadata !"one.c", metadata !"/private/tmp"}
 !18 = metadata !{i32 0}
-!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
index 625a351..ac7fbf2 100644
--- a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
+++ b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
@@ -22,8 +22,8 @@ target triple = "x86_64-apple-darwin10.0.0"
 
 define i64 @gcd(i64 %a, i64 %b) nounwind readnone optsize noinline ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !10), !dbg !18
-  tail call void @llvm.dbg.value(metadata !{i64 %b}, i64 0, metadata !11), !dbg !19
+  tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata !{i64 %b}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !19
   br label %while.body, !dbg !20
 
 while.body:                                       ; preds = %while.body, %entry
@@ -34,14 +34,14 @@ while.body:                                       ; preds = %while.body, %entry
   br i1 %cmp, label %if.then, label %while.body, !dbg !23
 
 if.then:                                          ; preds = %while.body
-  tail call void @llvm.dbg.value(metadata !{i64 %rem}, i64 0, metadata !12), !dbg !21
+  tail call void @llvm.dbg.value(metadata !{i64 %rem}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !21
   ret i64 %b.addr.0, !dbg !23
 }
 
 define i32 @main() nounwind optsize ssp {
 entry:
   %call = tail call i32 @rand() nounwind optsize, !dbg !24
-  tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !14), !dbg !24
+  tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !24
   %cmp = icmp ugt i32 %call, 21, !dbg !25
   br i1 %cmp, label %cond.true, label %cond.end, !dbg !25
 
@@ -51,7 +51,7 @@ cond.true:                                        ; preds = %entry
 
 cond.end:                                         ; preds = %entry, %cond.true
   %cond = phi i32 [ %call1, %cond.true ], [ %call, %entry ], !dbg !25
-  tail call void @llvm.dbg.value(metadata !{i32 %cond}, i64 0, metadata !17), !dbg !25
+  tail call void @llvm.dbg.value(metadata !{i32 %cond}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !25
   %conv = sext i32 %cond to i64, !dbg !26
   %conv5 = zext i32 %call to i64, !dbg !26
   %call6 = tail call i64 @gcd(i64 %conv, i64 %conv5) optsize, !dbg !26
@@ -71,36 +71,36 @@ declare i32 @rand() optsize
 
 declare i32 @printf(i8* nocapture, ...) nounwind optsize
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 declare i32 @puts(i8* nocapture) nounwind
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!33}
 
-!0 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"gcd", metadata !"gcd", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i64 (i64, i64)* @gcd, null, null, metadata !29, i32 0} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [gcd]
-!1 = metadata !{i32 786473, metadata !31} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !31, i32 12, metadata !"clang version 2.9 (trunk 124117)", i1 true, metadata !"", i32 0, metadata !32, metadata !32, metadata !28, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00gcd\00gcd\00\005\000\001\000\006\00256\001\000", metadata !31, metadata !1, metadata !3, null, i64 (i64, i64)* @gcd, null, null, metadata !29} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [gcd]
+!1 = metadata !{metadata !"0x29", metadata !31} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 124117)\001\00\000\00\001", metadata !31, metadata !32, metadata !32, metadata !28, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !2, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 25, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @main, null, null, metadata !30, i32 0} ; [ DW_TAG_subprogram ] [line 25] [def] [scope 0] [main]
-!7 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00main\00main\00\0025\000\001\000\006\000\001\000", metadata !31, metadata !1, metadata !7, null, i32 ()* @main, null, null, metadata !30} ; [ DW_TAG_subprogram ] [line 25] [def] [scope 0] [main]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !1, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 5, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{i32 786689, metadata !0, metadata !"b", metadata !1, i32 5, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{i32 786688, metadata !13, metadata !"c", metadata !1, i32 6, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!13 = metadata !{i32 786443, metadata !31, metadata !0, i32 5, i32 52, i32 0} ; [ DW_TAG_lexical_block ]
-!14 = metadata !{i32 786688, metadata !15, metadata !"m", metadata !1, i32 26, metadata !16, i32 0, null} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{i32 786443, metadata !31, metadata !6, i32 25, i32 12, i32 2} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!17 = metadata !{i32 786688, metadata !15, metadata !"z_s", metadata !1, i32 27, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0x101\00a\005\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!11 = metadata !{metadata !"0x101\00b\005\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!12 = metadata !{metadata !"0x100\00c\006\000", metadata !13, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!13 = metadata !{metadata !"0xb\005\0052\000", metadata !31, metadata !0} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0x100\00m\0026\000", metadata !15, metadata !1, metadata !16} ; [ DW_TAG_auto_variable ]
+!15 = metadata !{metadata !"0xb\0025\0012\002", metadata !31, metadata !6} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, metadata !2} ; [ DW_TAG_base_type ]
+!17 = metadata !{metadata !"0x100\00z_s\0027\000", metadata !15, metadata !1, metadata !9} ; [ DW_TAG_auto_variable ]
 !18 = metadata !{i32 5, i32 41, metadata !0, null}
 !19 = metadata !{i32 5, i32 49, metadata !0, null}
 !20 = metadata !{i32 7, i32 5, metadata !13, null}
 !21 = metadata !{i32 8, i32 9, metadata !22, null}
-!22 = metadata !{i32 786443, metadata !31, metadata !13, i32 7, i32 14, i32 1} ; [ DW_TAG_lexical_block ]
+!22 = metadata !{metadata !"0xb\007\0014\001", metadata !31, metadata !13} ; [ DW_TAG_lexical_block ]
 !23 = metadata !{i32 9, i32 9, metadata !22, null}
 !24 = metadata !{i32 26, i32 38, metadata !15, null}
 !25 = metadata !{i32 27, i32 38, metadata !15, null}
@@ -111,4 +111,4 @@ declare i32 @puts(i8* nocapture) nounwind
 !30 = metadata !{metadata !14, metadata !17}
 !31 = metadata !{metadata !"rem_small.c", metadata !"/private/tmp"}
 !32 = metadata !{i32 0}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2011-08-29-InitOrder.ll b/test/CodeGen/X86/2011-08-29-InitOrder.ll
index a95dcb5..b278ad6 100644
--- a/test/CodeGen/X86/2011-08-29-InitOrder.ll
+++ b/test/CodeGen/X86/2011-08-29-InitOrder.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck %s --check-prefix=CHECK-DEFAULT
+; RUN: llc < %s -mtriple=i386-linux-gnu -use-ctors | FileCheck %s --check-prefix=CHECK-DEFAULT
 ; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s --check-prefix=CHECK-DARWIN
 ; PR5329
 
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 16706ae..6651af7 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -8,7 +8,7 @@
 ;CHECK: vpxor
 ;CHECK: vinserti128
 ;CHECK: vpshufd
-;CHECK: vpshufd
+;CHECK: vpbroadcastd
 ;CHECK: vmulps
 ;CHECK: vmulps
 ;CHECK: ret
diff --git a/test/CodeGen/X86/2012-07-15-broadcastfold.ll b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
index 1c39c74..519c7ca 100644
--- a/test/CodeGen/X86/2012-07-15-broadcastfold.ll
+++ b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s
 
 declare x86_fastcallcc i64 @barrier()
 
diff --git a/test/CodeGen/X86/2012-10-02-DAGCycle.ll b/test/CodeGen/X86/2012-10-02-DAGCycle.ll
index 8d914db..403d21a 100644
--- a/test/CodeGen/X86/2012-10-02-DAGCycle.ll
+++ b/test/CodeGen/X86/2012-10-02-DAGCycle.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=i386-apple-macosx -relocation-model=pic < %s
-; RUN: llc -mtriple=x86_64-apple-macosx -relocation-model=pic < %s
+; RUN: llc -mtriple=i386-apple-macosx -relocation-model=pic < %s > /dev/null
+; RUN: llc -mtriple=x86_64-apple-macosx -relocation-model=pic < %s > /dev/null
 
 ; rdar://12393897
 
diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
index 62ee1e1..1a5efda 100644
--- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
@@ -12,11 +12,11 @@
 %struct.hgstruct.2.29 = type { %struct.bnode.1.28*, [3 x double], double, [3 x double] }
 %struct.bnode.1.28 = type { i16, double, [3 x double], i32, i32, [3 x double], [3 x double], [3 x double], double, %struct.bnode.1.28*, %struct.bnode.1.28* }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define signext i16 @subdivp(%struct.node.0.27* nocapture %p, double %dsq, double %tolsq, %struct.hgstruct.2.29* nocapture byval align 8 %hg) nounwind uwtable readonly ssp {
 entry:
-  call void @llvm.dbg.declare(metadata !{%struct.hgstruct.2.29* %hg}, metadata !4)
+  call void @llvm.dbg.declare(metadata !{%struct.hgstruct.2.29* %hg}, metadata !4, metadata !{metadata !"0x102"})
   %type = getelementptr inbounds %struct.node.0.27* %p, i64 0, i32 0
   %0 = load i16* %type, align 2
   %cmp = icmp eq i16 %0, 1
@@ -33,16 +33,20 @@ return:                                           ; preds = %for.cond.preheader,
   ret i16 %retval.0
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!12}
 
-!0 = metadata !{i32 786449, metadata !11, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 168918) (llvm/trunk 168920)\001\00\000\00\000", metadata !11, metadata !2, metadata !2, metadata !13, metadata !2, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99]
 !2 = metadata !{}
-!4 = metadata !{i32 786689, null, metadata !"hg", metadata !5, i32 67109589, metadata !6, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [hg] [line 725]
-!5 = metadata !{i32 786473, metadata !11} ; [ DW_TAG_file_type ]
-!6 = metadata !{i32 786454, metadata !11, null, metadata !"hgstruct", i32 492, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 786451, metadata !11, null, metadata !"", i32 487, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, i32 0, null} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x101\00hg\0067109589\000", null, metadata !5, metadata !6} ; [ DW_TAG_arg_variable ] [hg] [line 725]
+!5 = metadata !{metadata !"0x29", metadata !11} ; [ DW_TAG_file_type ]
+!6 = metadata !{metadata !"0x16\00hgstruct\00492\000\000\000\000", metadata !11, null, metadata !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0x13\00\00487\00512\0064\000\000\000", metadata !11, null, null, null, null, i32 0, null} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [def] [from ]
 !11 = metadata !{metadata !"MultiSource/Benchmarks/Olden/bh/newbh.c", metadata !"MultiSource/Benchmarks/Olden/bh"}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!13 = metadata !{metadata !14}
+!14 = metadata !{metadata !"0x2e\00subdivp\00subdivp\00\000\000\001\000\006\00256\001\001", metadata !11, metadata !5, metadata !15, null, i16 (%struct.node.0.27*, double, double, %struct.hgstruct.2.29* )* @subdivp, null, null, null} ; [ DW_TAG_subprogram ] [def] [subdivp]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{null}
diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
index 36667de..083aacd 100644
--- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
@@ -12,7 +12,7 @@
 
 @.str15 = external hidden unnamed_addr constant [6 x i8], align 1
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define i32 @AttachGalley(%union.rec** nocapture %suspend_pt) nounwind uwtable ssp {
 entry:
@@ -43,7 +43,7 @@ if.then3344:
   br label %if.then4073
 
 if.then4073:                                      ; preds = %if.then3344
-  call void @llvm.dbg.declare(metadata !{[20 x i8]* %num14075}, metadata !4)
+  call void @llvm.dbg.declare(metadata !{[20 x i8]* %num14075}, metadata !4, metadata !{metadata !"0x102"})
   %arraydecay4078 = getelementptr inbounds [20 x i8]* %num14075, i64 0, i64 0
   %0 = load i32* undef, align 4
   %add4093 = add nsw i32 %0, 0
@@ -65,26 +65,31 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!35}
 
-!0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 168918) (llvm/trunk 168920)\001\00\000\00\000", metadata !19, metadata !2, metadata !2, metadata !20, metadata !2, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99]
 !1 = metadata !{metadata !2}
 !2 = metadata !{}
-!4 = metadata !{i32 786688, metadata !5, metadata !"num1", metadata !14, i32 815, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [num1] [line 815]
-!5 = metadata !{i32 786443, metadata !14, metadata !6, i32 815, i32 0, i32 177} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!6 = metadata !{i32 786443, metadata !14, metadata !7, i32 812, i32 0, i32 176} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!7 = metadata !{i32 786443, metadata !14, metadata !8, i32 807, i32 0, i32 175} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!8 = metadata !{i32 786443, metadata !14, metadata !9, i32 440, i32 0, i32 94} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!9 = metadata !{i32 786443, metadata !14, metadata !10, i32 435, i32 0, i32 91} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!10 = metadata !{i32 786443, metadata !14, metadata !11, i32 434, i32 0, i32 90} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!11 = metadata !{i32 786443, metadata !14, metadata !12, i32 250, i32 0, i32 24} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!12 = metadata !{i32 786443, metadata !14, metadata !13, i32 249, i32 0, i32 23} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!13 = metadata !{i32 786443, metadata !14, metadata !2, i32 221, i32 0, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!14 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !16, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
-!16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!4 = metadata !{metadata !"0x100\00num1\00815\000", metadata !5, metadata !14, metadata !15} ; [ DW_TAG_auto_variable ] [num1] [line 815]
+!5 = metadata !{metadata !"0xb\00815\000\00177", metadata !14, metadata !6} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!6 = metadata !{metadata !"0xb\00812\000\00176", metadata !14, metadata !7} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!7 = metadata !{metadata !"0xb\00807\000\00175", metadata !14, metadata !8} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!8 = metadata !{metadata !"0xb\00440\000\0094", metadata !14, metadata !9} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!9 = metadata !{metadata !"0xb\00435\000\0091", metadata !14, metadata !10} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!10 = metadata !{metadata !"0xb\00434\000\0090", metadata !14, metadata !11} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!11 = metadata !{metadata !"0xb\00250\000\0024", metadata !14, metadata !12} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!12 = metadata !{metadata !"0xb\00249\000\0023", metadata !14, metadata !13} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!13 = metadata !{metadata !"0xb\00221\000\0019", metadata !14, metadata !2} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!14 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ]
+!15 = metadata !{metadata !"0x1\00\000\00160\008\000\000", null, null, metadata !16, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
+!16 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
 !17 = metadata !{metadata !18}
-!18 = metadata !{i32 786465, i64 0, i64 20}       ; [ DW_TAG_subrange_type ] [0, 19]
+!18 = metadata !{metadata !"0x21\000\0020"}       ; [ DW_TAG_subrange_type ] [0, 19]
 !19 = metadata !{metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset"}
 
+!20 = metadata !{metadata !21}
+!21 = metadata !{metadata !"0x2e\00AttachGalley\00AttachGalley\00\000\000\001\000\006\00256\001\001", metadata !19, metadata !14, metadata !22, null, i32 (%union.rec**)* @AttachGalley, null, null, null} ; [ DW_TAG_subprogram ] [def] [AttachGalley]
+!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = metadata !{null}
+
 ; Test DebugValue uses visited by RegisterPressureTracker findUseBetween().
 ;
 ; CHECK: @main
@@ -103,7 +108,7 @@ cond.true:                                        ; preds = %entry
   unreachable
 
 cond.end:                                         ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{%"class.__gnu_cxx::hash_map"* %X}, metadata !31)
+  call void @llvm.dbg.declare(metadata !{%"class.__gnu_cxx::hash_map"* %X}, metadata !31, metadata !{metadata !"0x102"})
   %_M_num_elements.i.i.i.i = getelementptr inbounds %"class.__gnu_cxx::hash_map"* %X, i64 0, i32 0, i32 5
   invoke void @_Znwm()
           to label %exit.i unwind label %lpad2.i.i.i.i
@@ -129,9 +134,11 @@ declare void @_Znwm()
 
 !llvm.dbg.cu = !{!30}
 
-!30 = metadata !{i32 786449, metadata !34, i32 4, metadata !"clang version 3.3 (trunk 169129) (llvm/trunk 169135)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] [SingleSource/Benchmarks/Shootout-C++/hash.cpp] [DW_LANG_C_plus_plus]
-!31 = metadata !{i32 786688, null, metadata !"X", null, i32 29, metadata !32, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [X] [line 29]
-!32 = metadata !{i32 786454, metadata !34, null, metadata !"HM", i32 28, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_typedef ] [HM] [line 28, size 0, align 0, offset 0] [from ]
-!33 = metadata !{i32 786473, metadata !34} ; [ DW_TAG_file_type ]
+!30 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 169129) (llvm/trunk 169135)\001\00\000\00\000", metadata !34, metadata !2, metadata !2, metadata !36, null, null} ; [ DW_TAG_compile_unit ] [SingleSource/Benchmarks/Shootout-C++/hash.cpp] [DW_LANG_C_plus_plus]
+!31 = metadata !{metadata !"0x100\00X\0029\000", null, null, metadata !32} ; [ DW_TAG_auto_variable ] [X] [line 29]
+!32 = metadata !{metadata !"0x16\00HM\0028\000\000\000\000", metadata !34, null, null} ; [ DW_TAG_typedef ] [HM] [line 28, size 0, align 0, offset 0] [from ]
+!33 = metadata !{metadata !"0x29", metadata !34} ; [ DW_TAG_file_type ]
 !34 = metadata !{metadata !"SingleSource/Benchmarks/Shootout-C++/hash.cpp", metadata !"SingleSource/Benchmarks/Shootout-C++"}
-!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!36 = metadata !{metadata !37}
+!37 = metadata !{metadata !"0x2e\00main\00main\00\000\000\001\000\006\00256\001\001", metadata !19, metadata !14, metadata !22, null, void ()* @main, null, null, null} ; [ DW_TAG_subprogram ] [def] [main]
diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
index 5aec3d9..458ce4f 100644
--- a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
@@ -9,7 +9,7 @@
 
 %struct.btCompoundLeafCallback = type { i32, i32 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define void @test() unnamed_addr uwtable ssp align 2 {
 entry:
@@ -20,7 +20,7 @@ if.then:                                          ; preds = %entry
   unreachable
 
 if.end:                                           ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{%struct.btCompoundLeafCallback* %callback}, metadata !3)
+  call void @llvm.dbg.declare(metadata !{%struct.btCompoundLeafCallback* %callback}, metadata !3, metadata !{metadata !"0x102"})
   %m = getelementptr inbounds %struct.btCompoundLeafCallback* %callback, i64 0, i32 1
   store i32 0, i32* undef, align 8
   %cmp12447 = icmp sgt i32 undef, 0
@@ -36,11 +36,13 @@ invoke.cont44:                                    ; preds = %if.end
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8}
 
-!0 = metadata !{i32 786449, metadata !6, i32 4, metadata !"clang version 3.3 (trunk 168984) (llvm/trunk 168983)", i1 true, metadata !"", i32 0, metadata !2, metadata !7, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Bullet/MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp] [DW_LANG_C_plus_plus]
-!2 = metadata !{null}
-!3 = metadata !{i32 786688, null, metadata !"callback", null, i32 214, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [callback] [line 214]
-!4 = metadata !{i32 786451, metadata !6, null, metadata !"btCompoundLeafCallback", i32 90, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [def] [from ]
-!5 = metadata !{i32 786473, metadata !6} ; [ DW_TAG_file_type ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 168984) (llvm/trunk 168983)\001\00\000\00\000", metadata !6, null, null, metadata !1, null, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Bullet/MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !2}
+!2 = metadata !{metadata !"0x2e\00test\00test\00\000\000\001\000\006\00256\001\001", metadata !6, metadata !5, metadata !7, null, void ()* @test, null, null, null} ; [ DW_TAG_subprogram ] [def] [test]
+!3 = metadata !{metadata !"0x100\00callback\00214\000", null, null, metadata !4} ; [ DW_TAG_auto_variable ] [callback] [line 214]
+!4 = metadata !{metadata !"0x13\00btCompoundLeafCallback\0090\00512\0064\000\000\000", metadata !6, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [def] [from ]
+!5 = metadata !{metadata !"0x29", metadata !6} ; [ DW_TAG_file_type ]
 !6 = metadata !{metadata !"MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", metadata !"MultiSource/Benchmarks/Bullet"}
-!7 = metadata !{i32 0}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!9 = metadata !{null}
diff --git a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
index bbba796..10dc927 100644
--- a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
+++ b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
@@ -6,7 +6,7 @@
 ; we may reference variables that were not live across basic blocks
 ; resulting in undefined virtual registers.
 ;
-; In this example, this is illustrated by a the spill/reload of the
+; In this example, this is illustrated by a spill/reload of the
 ; LOADED_PTR_SLOT.
 ;
 ; Before this patch, the compiler was accessing two different spill
diff --git a/test/CodeGen/X86/2014-08-29-CompactUnwind.ll b/test/CodeGen/X86/2014-08-29-CompactUnwind.ll
new file mode 100644
index 0000000..f65d7c9
--- /dev/null
+++ b/test/CodeGen/X86/2014-08-29-CompactUnwind.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 -filetype=obj -o - | llvm-objdump -d -unwind-info -s - | FileCheck %s
+; Regression test for http://llvm.org/bugs/show_bug.cgi?id=20800.
+
+; ModuleID = 'asan_report.ii'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+@.str = private unnamed_addr constant [3 x i8] c"=>\00", align 1
+@.str1 = private unnamed_addr constant [3 x i8] c"  \00", align 1
+@.str2 = private unnamed_addr constant [6 x i8] c"%s%p:\00", align 1
+
+; CHECK: ___asan_report_error:
+
+; subq instruction starts at 0x0a, so the second byte of the compact encoding
+; (UNWIND_X86_64_FRAMELESS_STACK_SIZE in mach-o/compact_unwind_encoding.h)
+; must be 0x0d.
+; CHECK: {{a:.*subq.*%rsp}}
+
+; CHECK: Contents of __compact_unwind section
+; CHECK: ___asan_report_error
+
+; Because of incorrect push instruction size in X86AsmBackend.cpp the stack
+; size was also calculated incorrectly.
+; CHECK-NOT: {{compact encoding:.*0x0309f800}}
+; CHECK: {{compact encoding:.*0x030df800}}
+
+define void @__asan_report_error() #0 {
+  %str.i = alloca i64, align 8
+  %stack = alloca [256 x i64], align 8
+  br label %print_shadow_bytes.exit.i
+
+print_shadow_bytes.exit.i: ; preds = %print_shadow_bytes.exit.i, %0
+  %iv.i = phi i64 [ -5, %0 ], [ %iv.next.i, %print_shadow_bytes.exit.i ]
+  %reg15 = icmp eq i64 %iv.i, 0
+  %.str..str1.i = select i1 %reg15, [3 x i8]* @.str, [3 x i8]* @.str1
+  %reg16 = getelementptr inbounds [3 x i8]* %.str..str1.i, i64 0, i64 0
+  %reg17 = shl i64 %iv.i, 1
+  %reg19 = inttoptr i64 %reg17 to i8*
+  call void (i64*, i8*, ...)* @append(i64* %str.i, i8* getelementptr inbounds ([6 x i8]* @.str2, i64 0, i64 0), i8* %reg16, i8* %reg19)
+  %iv.next.i = add nsw i64 %iv.i, 0
+  br label %print_shadow_bytes.exit.i
+}
+
+declare void @append(i64*, i8*, ...)
+
+attributes #0 = { "no-frame-pointer-elim"="false" }
diff --git a/test/CodeGen/X86/MachineSink-DbgValue.ll b/test/CodeGen/X86/MachineSink-DbgValue.ll
index 4ce2fb3..54d8f65 100644
--- a/test/CodeGen/X86/MachineSink-DbgValue.ll
+++ b/test/CodeGen/X86/MachineSink-DbgValue.ll
@@ -4,10 +4,10 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-apple-macosx10.7.0"
 
 define i32 @foo(i32 %i, i32* nocapture %c) nounwind uwtable readonly ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !6), !dbg !12
+  tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !12
   %ab = load i32* %c, align 1, !dbg !14
-  tail call void @llvm.dbg.value(metadata !{i32* %c}, i64 0, metadata !7), !dbg !13
-  tail call void @llvm.dbg.value(metadata !{i32 %ab}, i64 0, metadata !10), !dbg !14
+  tail call void @llvm.dbg.value(metadata !{i32* %c}, i64 0, metadata !7, metadata !{metadata !"0x102"}), !dbg !13
+  tail call void @llvm.dbg.value(metadata !{i32 %ab}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !14
   %cd = icmp eq i32 %i, 42, !dbg !15
   br i1 %cd, label %bb1, label %bb2, !dbg !15
 
@@ -23,23 +23,23 @@ bb2:
   ret i32 %.0, !dbg !17
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!22}
 
-!0 = metadata !{i32 786449, metadata !20, i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, metadata !18, null,  null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i32*)* @foo, null, null, metadata !19, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
-!2 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)\001\00\000\00\001", metadata !20, metadata !21, metadata !21, metadata !18, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00\002\000\001\000\006\00256\001\000", metadata !20, metadata !2, metadata !3, null, i32 (i32, i32*)* @foo, null, null, metadata !19} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
+!2 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 16777218, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{i32 786689, metadata !1, metadata !"c", metadata !2, i32 33554434, metadata !8, i32 0, null} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ]
-!9 = metadata !{i32 786468, null, metadata !0, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786688, metadata !11, metadata !"a", metadata !2, i32 3, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{i32 786443, metadata !20, metadata !1, i32 2, i32 25, i32 0} ; [ DW_TAG_lexical_block ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x101\00i\0016777218\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
+!7 = metadata !{metadata !"0x101\00c\0033554434\000", metadata !1, metadata !2, metadata !8} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !0, metadata !9} ; [ DW_TAG_pointer_type ]
+!9 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !0} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0x100\00a\003\000", metadata !11, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!11 = metadata !{metadata !"0xb\002\0025\000", metadata !20, metadata !1} ; [ DW_TAG_lexical_block ]
 !12 = metadata !{i32 2, i32 13, metadata !1, null}
 !13 = metadata !{i32 2, i32 22, metadata !1, null}
 !14 = metadata !{i32 3, i32 14, metadata !11, null}
@@ -50,4 +50,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !19 = metadata !{metadata !6, metadata !7, metadata !10}
 !20 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
 !21 = metadata !{i32 0}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll
index 51d0d17..6865873 100644
--- a/test/CodeGen/X86/StackColoring-dbg.ll
+++ b/test/CodeGen/X86/StackColoring-dbg.ll
@@ -5,7 +5,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define void @foo() nounwind uwtable ssp {
 entry:
@@ -17,7 +17,7 @@ entry:
 for.body:
   call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
   call void @llvm.lifetime.start(i64 -1, i8* %x.i) nounwind
-  call void @llvm.dbg.declare(metadata !{i8* %x.i}, metadata !22) nounwind
+  call void @llvm.dbg.declare(metadata !{i8* %x.i}, metadata !22, metadata !{metadata !"0x102"}) nounwind
   br label %for.body
 }
 
@@ -27,9 +27,9 @@ declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!23}
-!0 = metadata !{i32 524305, metadata !1, i32 1, metadata !"clang", i1 true, metadata !"", i32 0, metadata !2, metadata !2, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\001\00clang\001\00\000\00\000", metadata !1, metadata !2, metadata !2, null, null, null} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !"t.c", metadata !""}
-!16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6}
+!16 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
 !2 = metadata !{i32 0}
-!22 = metadata !{i32 786688, null, metadata !"x", metadata !2, i32 16, metadata !16, i32 0, i32 0}
-!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{metadata !"0x100\00x\0016\000", null, metadata !2, metadata !16} ; [ DW_TAG_auto_variable ]
+!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/SwizzleShuff.ll b/test/CodeGen/X86/SwizzleShuff.ll
index 100817a..a435272 100644
--- a/test/CodeGen/X86/SwizzleShuff.ll
+++ b/test/CodeGen/X86/SwizzleShuff.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s
 
 ; Check that we perform a scalar XOR on i32.
 
diff --git a/test/CodeGen/X86/TruncAssertZext.ll b/test/CodeGen/X86/TruncAssertZext.ll
new file mode 100644
index 0000000..8c66412
--- /dev/null
+++ b/test/CodeGen/X86/TruncAssertZext.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -O2 -march=x86-64 | FileCheck %s
+; Checks that a zeroing mov is inserted for the trunc/zext pair even when
+; the source of the zext is an AssertSext node
+; PR20494
+
+define i64 @main(i64 %a) {
+; CHECK-LABEL: main
+; CHECK: movl %e{{..}}, %eax
+; CHECK: ret
+  %or = or i64 %a, -2
+  %trunc = trunc i64 %or to i32
+  br label %l
+l:
+  %ext = zext i32 %trunc to i64
+  ret i64 %ext
+}
diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll
index 1513fcb..9c24be4 100644
--- a/test/CodeGen/X86/add-of-carry.ll
+++ b/test/CodeGen/X86/add-of-carry.ll
@@ -4,7 +4,7 @@
 define i32 @test1(i32 %sum, i32 %x) nounwind readnone ssp {
 entry:
 ; CHECK-LABEL: test1:
-; CHECK: cmpl %ecx, %eax
+; CHECK: cmpl %ecx, %eax 
 ; CHECK-NOT: addl
 ; CHECK: adcl $0, %eax
   %add4 = add i32 %x, %sum
diff --git a/test/CodeGen/X86/add_shl_constant.ll b/test/CodeGen/X86/add_shl_constant.ll
new file mode 100644
index 0000000..33074e4
--- /dev/null
+++ b/test/CodeGen/X86/add_shl_constant.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+
+; CHECK-LABEL: add_shl_add_constant_1_i32
+; CHECK: leal 984(%rsi,%rdi,8), %eax
+; CHECK-NEXT: retq
+define i32 @add_shl_add_constant_1_i32(i32 %x, i32 %y) nounwind {
+  %add.0 = add i32 %x, 123
+  %shl = shl i32 %add.0, 3
+  %add.1 = add i32 %shl, %y
+  ret i32 %add.1
+}
+
+; CHECK-LABEL: add_shl_add_constant_2_i32
+; CHECK: leal 984(%rsi,%rdi,8), %eax
+; CHECK-NEXT: retq
+define i32 @add_shl_add_constant_2_i32(i32 %x, i32 %y) nounwind {
+  %add.0 = add i32 %x, 123
+  %shl = shl i32 %add.0, 3
+  %add.1 = add i32 %y, %shl
+  ret i32 %add.1
+}
+
+; CHECK: LCPI2_0:
+; CHECK: .long 984
+; CHECK: _add_shl_add_constant_1_v4i32
+; CHECK: pslld $3, %[[REG:xmm[0-9]+]]
+; CHECK: paddd %xmm1, %[[REG]]
+; CHECK: paddd LCPI2_0(%rip), %[[REG:xmm[0-9]+]]
+; CHECK: retq
+define <4 x i32> @add_shl_add_constant_1_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+  %add.0 = add <4 x i32> %x, <i32 123, i32 123, i32 123, i32 123>
+  %shl = shl <4 x i32> %add.0, <i32 3, i32 3, i32 3, i32 3>
+  %add.1 = add <4 x i32> %shl, %y
+  ret <4 x i32> %add.1
+}
+
+; CHECK: LCPI3_0:
+; CHECK: .long 984
+; CHECK: _add_shl_add_constant_2_v4i32
+; CHECK: pslld $3, %[[REG:xmm[0-9]+]]
+; CHECK: paddd %xmm1, %[[REG]]
+; CHECK: paddd LCPI3_0(%rip), %[[REG:xmm[0-9]+]]
+; CHECK: retq
+define <4 x i32> @add_shl_add_constant_2_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+  %add.0 = add <4 x i32> %x, <i32 123, i32 123, i32 123, i32 123>
+  %shl = shl <4 x i32> %add.0, <i32 3, i32 3, i32 3, i32 3>
+  %add.1 = add <4 x i32> %y, %shl
+  ret <4 x i32> %add.1
+}
diff --git a/test/CodeGen/X86/addr-mode-matcher.ll b/test/CodeGen/X86/addr-mode-matcher.ll
new file mode 100644
index 0000000..d592091
--- /dev/null
+++ b/test/CodeGen/X86/addr-mode-matcher.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s | FileCheck %s
+
+; This testcase used to hit an assert during ISel.  For details, see the big
+; comment inside the function.
+
+; CHECK-LABEL: foo:
+; The AND should be turned into a subreg access.
+; CHECK-NOT: and
+; The shift (leal) should be folded into the scale of the address in the load.
+; CHECK-NOT: leal
+; CHECK: movl {{.*}},4),
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.6.0"
+
+define void @foo(i32 %a) {
+bb:
+  br label %bb1692
+
+bb1692:
+  %tmp1694 = phi i32 [ 0, %bb ], [ %tmp1745, %bb1692 ]
+  %xor = xor i32 0, %tmp1694
+
+; %load1 = (load (and (shl %xor, 2), 1020))
+  %tmp1701 = shl i32 %xor, 2
+  %tmp1702 = and i32 %tmp1701, 1020
+  %tmp1703 = getelementptr inbounds [1028 x i8]* null, i32 0, i32 %tmp1702
+  %tmp1704 = bitcast i8* %tmp1703 to i32*
+  %load1 = load i32* %tmp1704, align 4
+
+; %load2 = (load (shl (and %xor, 255), 2))
+  %tmp1698 = and i32 %xor, 255
+  %tmp1706 = shl i32 %tmp1698, 2
+  %tmp1707 = getelementptr inbounds [1028 x i8]* null, i32 0, i32 %tmp1706
+  %tmp1708 = bitcast i8* %tmp1707 to i32*
+  %load2 = load i32* %tmp1708, align 4
+
+  %tmp1710 = or i32 %load2, %a
+
+; While matching xor we address-match %load1.  The and-of-shift reassocication
+; in address matching transform this into into a shift-of-and and the resuting
+; node becomes identical to %load2.  CSE replaces %load1 which leaves its
+; references in MatchScope and RecordedNodes stale.
+  %tmp1711 = xor i32 %load1, %tmp1710
+
+  %tmp1744 = getelementptr inbounds [256 x i32]* null, i32 0, i32 %tmp1711
+  store i32 0, i32* %tmp1744, align 4
+  %tmp1745 = add i32 %tmp1694, 1
+  indirectbr i8* undef, [label %bb1756, label %bb1692]
+
+bb1756:
+  br label %bb2705
+
+bb2705:
+  indirectbr i8* undef, [label %bb5721, label %bb5736]
+
+bb5721:
+  br label %bb2705
+
+bb5736:
+  ret void
+}
diff --git a/test/CodeGen/X86/address-type-promotion-constantexpr.ll b/test/CodeGen/X86/address-type-promotion-constantexpr.ll
new file mode 100644
index 0000000..32f29bd
--- /dev/null
+++ b/test/CodeGen/X86/address-type-promotion-constantexpr.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux
+
+; PR20314 is a crashing bug. This program does nothing with the load, so just check that the return is 0.
+
+@c = common global [2 x i32] zeroinitializer, align 4
+@a = common global i32 0, align 4
+@b = internal unnamed_addr constant [2 x i8] c"\01\00", align 1
+
+; CHECK-LABEL: main
+; CHECK: xor %eax, %eax
+define i32 @main() {
+entry:
+  %foo = load i8* getelementptr ([2 x i8]* @b, i64 0, i64 sext (i8 or (i8 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32]* @c, i64 0, i64 1), i32* @a) to i8), i8 1) to i64)), align 1
+  ret i32 0
+}
+
diff --git a/test/CodeGen/X86/adx-intrinsics.ll b/test/CodeGen/X86/adx-intrinsics.ll
new file mode 100644
index 0000000..0498177
--- /dev/null
+++ b/test/CodeGen/X86/adx-intrinsics.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 --show-mc-encoding| FileCheck %s --check-prefix=NOADX --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=broadwell --show-mc-encoding| FileCheck %s --check-prefix=ADX --check-prefix=CHECK
+
+declare i8 @llvm.x86.addcarryx.u32(i8, i32, i32, i8*)
+
+define i8 @test_addcarryx_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarryx_u32
+; CHECK: addb
+; ADX: adcxl
+; CHECK: setb
+; CHECK: retq
+  %ret = tail call i8 @llvm.x86.addcarryx.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
+  ret i8 %ret;
+}
+
+declare i8 @llvm.x86.addcarryx.u64(i8, i64, i64, i8*)
+
+define i8 @test_addcarryx_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarryx_u64
+; CHECK: addb
+; ADX: adcxq
+; CHECK: setb
+; CHECK: retq
+  %ret = tail call i8 @llvm.x86.addcarryx.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
+  ret i8 %ret;
+}
+
+declare i8 @llvm.x86.addcarry.u32(i8, i32, i32, i8*)
+
+define i8 @test_addcarry_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarry_u32
+; CHECK: addb
+; ADX: adcxl
+; NOADX: adcl
+; CHECK: setb
+; CHECK: retq
+  %ret = tail call i8 @llvm.x86.addcarry.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
+  ret i8 %ret;
+}
+
+declare i8 @llvm.x86.addcarry.u64(i8, i64, i64, i8*)
+
+define i8 @test_addcarry_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarry_u64
+; CHECK: addb
+; ADX: adcxq
+; NOADX: adcq
+; CHECK: setb
+; CHECK: retq
+  %ret = tail call i8 @llvm.x86.addcarry.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
+  ret i8 %ret;
+}
+
+declare i8 @llvm.x86.subborrow.u32(i8, i32, i32, i8*)
+
+define i8 @test_subborrow_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
+; CHECK-LABEL: test_subborrow_u32
+; CHECK: addb
+; CHECK: sbbl
+; CHECK: setb
+; CHECK: retq
+  %ret = tail call i8 @llvm.x86.subborrow.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
+  ret i8 %ret;
+}
+
+declare i8 @llvm.x86.subborrow.u64(i8, i64, i64, i8*)
+
+define i8 @test_subborrow_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
+; CHECK-LABEL: test_subborrow_u64
+; CHECK: addb
+; CHECK: sbbq
+; CHECK: setb
+; CHECK: retq
+  %ret = tail call i8 @llvm.x86.subborrow.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
+  ret i8 %ret;
+}
+
diff --git a/test/CodeGen/X86/aliases.ll b/test/CodeGen/X86/aliases.ll
index bf55644..82a8e48 100644
--- a/test/CodeGen/X86/aliases.ll
+++ b/test/CodeGen/X86/aliases.ll
@@ -30,12 +30,12 @@ define i32 @foo_f() {
   ret i32 0
 }
 ; CHECK-DAG: .weak	bar_f
-@bar_f = alias weak %FunTy* @foo_f
+@bar_f = weak alias %FunTy* @foo_f
 
-@bar_l = alias linkonce_odr i32* @bar
+@bar_l = linkonce_odr alias i32* @bar
 ; CHECK-DAG: .weak	bar_l
 
-@bar_i = alias internal i32* @bar
+@bar_i = internal alias i32* @bar
 
 ; CHECK-DAG: .globl	A
 @A = alias bitcast (i32* @bar to i64*)
diff --git a/test/CodeGen/X86/aligned-variadic.ll b/test/CodeGen/X86/aligned-variadic.ll
new file mode 100644
index 0000000..e2155fe
--- /dev/null
+++ b/test/CodeGen/X86/aligned-variadic.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin   | FileCheck %s -check-prefix=X32
+
+%struct.Baz = type { [17 x i8] }
+%struct.__va_list_tag = type { i32, i32, i8*, i8* }
+
+; Function Attrs: nounwind uwtable
+define void @bar(%struct.Baz* byval nocapture readnone align 8 %x, ...) {
+entry:
+  %va = alloca [1 x %struct.__va_list_tag], align 16
+  %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag]* %va, i64 0, i64 0
+  %arraydecay1 = bitcast [1 x %struct.__va_list_tag]* %va to i8*
+  call void @llvm.va_start(i8* %arraydecay1)
+  %overflow_arg_area_p = getelementptr inbounds [1 x %struct.__va_list_tag]* %va, i64 0, i64 0, i32 2
+  %overflow_arg_area = load i8** %overflow_arg_area_p, align 8
+  %overflow_arg_area.next = getelementptr i8* %overflow_arg_area, i64 24
+  store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8
+; X32: leal    68(%esp), [[REG:%.*]]
+; X32: movl    [[REG]], 16(%esp)
+; X64: leaq    232(%rsp), [[REG:%.*]]
+; X64: movq    [[REG]], 184(%rsp)
+; X64: leaq    176(%rsp), %rdi
+  call void @qux(%struct.__va_list_tag* %arraydecay)
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.va_start(i8*)
+
+declare void @qux(%struct.__va_list_tag*)
diff --git a/test/CodeGen/X86/alloca-align-rounding.ll b/test/CodeGen/X86/alloca-align-rounding.ll
index 74b9470..9d8b6cf 100644
--- a/test/CodeGen/X86/alloca-align-rounding.ll
+++ b/test/CodeGen/X86/alloca-align-rounding.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux -enable-misched=false | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnux32 -enable-misched=false | FileCheck %s -check-prefix=X32ABI
 
 declare void @bar(<2 x i64>* %n)
 
@@ -6,15 +7,29 @@ define void @foo(i64 %h) {
   %p = alloca <2 x i64>, i64 %h
   call void @bar(<2 x i64>* %p)
   ret void
-; CHECK: foo
+; CHECK-LABEL: foo
 ; CHECK-NOT: andq $-32, %rax
+; X32ABI-LABEL: foo
+; X32ABI-NOT: andl $-32, %eax
 }
 
 define void @foo2(i64 %h) {
   %p = alloca <2 x i64>, i64 %h, align 32
   call void @bar(<2 x i64>* %p)
   ret void
-; CHECK: foo2
+; CHECK-LABEL: foo2
 ; CHECK: andq $-32, %rsp
 ; CHECK: andq $-32, %rax
+; X32ABI-LABEL: foo2
+; X32ABI: andl $-32, %esp
+; X32ABI: andl $-32, %eax
+}
+
+define void @foo3(i64 %h) {
+  %p = alloca <2 x i64>, i64 %h
+  ret void
+; CHECK-LABEL: foo3
+; CHECK: movq %rbp, %rsp
+; X32ABI-LABEL: foo3
+; X32ABI: movl %ebp, %esp
 }
diff --git a/test/CodeGen/X86/asm-block-labels.ll b/test/CodeGen/X86/asm-block-labels.ll
index 6dbfb16..9352438 100644
--- a/test/CodeGen/X86/asm-block-labels.ll
+++ b/test/CodeGen/X86/asm-block-labels.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -std-compile-opts | llc -no-integrated-as
+; RUN: opt < %s -O3 | llc -no-integrated-as
 ; ModuleID = 'block12.c'
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin8"
diff --git a/test/CodeGen/X86/atomic-load-store-wide.ll b/test/CodeGen/X86/atomic-load-store-wide.ll
index 7352d5a..ad1a5c6 100644
--- a/test/CodeGen/X86/atomic-load-store-wide.ll
+++ b/test/CodeGen/X86/atomic-load-store-wide.ll
@@ -4,16 +4,18 @@
 ; FIXME: The generated code can be substantially improved.
 
 define void @test1(i64* %ptr, i64 %val1) {
-; CHECK: test1
-; CHECK: cmpxchg8b
+; CHECK-LABEL: test1
+; CHECK: lock
+; CHECK-NEXT: cmpxchg8b
 ; CHECK-NEXT: jne
   store atomic i64 %val1, i64* %ptr seq_cst, align 8
   ret void
 }
 
 define i64 @test2(i64* %ptr) {
-; CHECK: test2
-; CHECK: cmpxchg8b
+; CHECK-LABEL: test2
+; CHECK: lock
+; CHECK-NEXT: cmpxchg8b
   %val = load atomic i64* %ptr seq_cst, align 8
   ret i64 %val
 }
diff --git a/test/CodeGen/X86/atomic-ops-ancient-64.ll b/test/CodeGen/X86/atomic-ops-ancient-64.ll
index 18749b9..508d83b 100644
--- a/test/CodeGen/X86/atomic-ops-ancient-64.ll
+++ b/test/CodeGen/X86/atomic-ops-ancient-64.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s
+; XFAIL: *
 
 define i64 @test_add(i64* %addr, i64 %inc) {
 ; CHECK-LABEL: test_add:
diff --git a/test/CodeGen/X86/atomic_add.ll b/test/CodeGen/X86/atomic_add.ll
index bdd25e6..f60212d 100644
--- a/test/CodeGen/X86/atomic_add.ll
+++ b/test/CodeGen/X86/atomic_add.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC
 
 ; rdar://7103704
 
@@ -14,6 +15,8 @@ define void @inc4(i64* nocapture %p) nounwind ssp {
 entry:
 ; CHECK-LABEL: inc4:
 ; CHECK: incq
+; SLOW_INC-LABEL: inc4:
+; SLOW_INC-NOT: incq
   %0 = atomicrmw add i64* %p, i64 1 monotonic
   ret void
 }
@@ -39,6 +42,8 @@ define void @inc3(i8* nocapture %p) nounwind ssp {
 entry:
 ; CHECK-LABEL: inc3:
 ; CHECK: incb
+; SLOW_INC-LABEL: inc3:
+; SLOW_INC-NOT: incb
   %0 = atomicrmw add i8* %p, i8 1 monotonic
   ret void
 }
@@ -64,6 +69,8 @@ define void @inc2(i16* nocapture %p) nounwind ssp {
 entry:
 ; CHECK-LABEL: inc2:
 ; CHECK: incw
+; SLOW_INC-LABEL: inc2:
+; SLOW_INC-NOT: incw
   %0 = atomicrmw add i16* %p, i16 1 monotonic
   ret void
 }
@@ -89,6 +96,8 @@ define void @inc1(i32* nocapture %p) nounwind ssp {
 entry:
 ; CHECK-LABEL: inc1:
 ; CHECK: incl
+; SLOW_INC-LABEL: inc1:
+; SLOW_INC-NOT: incl
   %0 = atomicrmw add i32* %p, i32 1 monotonic
   ret void
 }
@@ -113,6 +122,8 @@ define void @dec4(i64* nocapture %p) nounwind ssp {
 entry:
 ; CHECK-LABEL: dec4:
 ; CHECK: decq
+; SLOW_INC-LABEL: dec4:
+; SLOW_INC-NOT: decq
   %0 = atomicrmw sub i64* %p, i64 1 monotonic
   ret void
 }
@@ -138,6 +149,8 @@ define void @dec3(i8* nocapture %p) nounwind ssp {
 entry:
 ; CHECK-LABEL: dec3:
 ; CHECK: decb
+; SLOW_INC-LABEL: dec3:
+; SLOW_INC-NOT: decb
   %0 = atomicrmw sub i8* %p, i8 1 monotonic
   ret void
 }
@@ -163,6 +176,8 @@ define void @dec2(i16* nocapture %p) nounwind ssp {
 entry:
 ; CHECK-LABEL: dec2:
 ; CHECK: decw
+; SLOW_INC-LABEL: dec2:
+; SLOW_INC-NOT: decw
   %0 = atomicrmw sub i16* %p, i16 1 monotonic
   ret void
 }
@@ -189,6 +204,8 @@ define void @dec1(i32* nocapture %p) nounwind ssp {
 entry:
 ; CHECK-LABEL: dec1:
 ; CHECK: decl
+; SLOW_INC-LABEL: dec1:
+; SLOW_INC-NOT: decl
   %0 = atomicrmw sub i32* %p, i32 1 monotonic
   ret void
 }
diff --git a/test/CodeGen/X86/atomic_idempotent.ll b/test/CodeGen/X86/atomic_idempotent.ll
new file mode 100644
index 0000000..1afc535
--- /dev/null
+++ b/test/CodeGen/X86/atomic_idempotent.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc < %s -march=x86 -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+
+; On x86, an atomic rmw operation that does not modify the value in memory
+; (such as atomic add 0) can be replaced by an mfence followed by a mov.
+; This is explained (with the motivation for such an optimization) in
+; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
+
+define i8 @add8(i8* %p) {
+; CHECK-LABEL: add8
+; CHECK: mfence
+; CHECK: movb
+  %1 = atomicrmw add i8* %p, i8 0 monotonic
+  ret i8 %1
+}
+
+define i16 @or16(i16* %p) {
+; CHECK-LABEL: or16
+; CHECK: mfence
+; CHECK: movw
+  %1 = atomicrmw or i16* %p, i16 0 acquire
+  ret i16 %1
+}
+
+define i32 @xor32(i32* %p) {
+; CHECK-LABEL: xor32
+; CHECK: mfence
+; CHECK: movl
+  %1 = atomicrmw xor i32* %p, i32 0 release
+  ret i32 %1
+}
+
+define i64 @sub64(i64* %p) {
+; CHECK-LABEL: sub64
+; X64: mfence
+; X64: movq
+; X32-NOT: mfence
+  %1 = atomicrmw sub i64* %p, i64 0 seq_cst
+  ret i64 %1
+}
+
+define i128 @or128(i128* %p) {
+; CHECK-LABEL: or128
+; CHECK-NOT: mfence
+  %1 = atomicrmw or i128* %p, i128 0 monotonic
+  ret i128 %1
+}
+
+; For 'and', the idempotent value is (-1)
+define i32 @and32 (i32* %p) {
+; CHECK-LABEL: and32
+; CHECK: mfence
+; CHECK: movl
+  %1 = atomicrmw and i32* %p, i32 -1 acq_rel
+  ret i32 %1
+}
diff --git a/test/CodeGen/X86/atomic_mi.ll b/test/CodeGen/X86/atomic_mi.ll
new file mode 100644
index 0000000..19e019e
--- /dev/null
+++ b/test/CodeGen/X86/atomic_mi.ll
@@ -0,0 +1,525 @@
+; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix X64
+; RUN: llc < %s -march=x86 -verify-machineinstrs | FileCheck %s --check-prefix X32
+; RUN: llc < %s -march=x86-64 -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC
+
+; This file checks that atomic (non-seq_cst) stores of immediate values are
+; done in one mov instruction and not 2. More precisely, it makes sure that the
+; immediate is not first copied uselessly into a register.
+
+; Similarily, it checks that a binary operation of an immediate with an atomic
+; variable that is stored back in that variable is done as a single instruction.
+; For example: x.store(42 + x.load(memory_order_acquire), memory_order_release)
+; should be just an add instruction, instead of loading x into a register, doing
+; an add and storing the result back.
+; The binary operations supported are currently add, and, or, xor.
+; sub is not supported because they are translated by an addition of the
+; negated immediate.
+; Finally, we also check the same kind of pattern for inc/dec
+
+; seq_cst stores are left as (lock) xchgl, but we try to check every other
+; attribute at least once.
+
+; Please note that these operations do not require the lock prefix: only
+; sequentially consistent stores require this kind of protection on X86.
+; And even for seq_cst operations, llvm uses the xchg instruction which has
+; an implicit lock prefix, so making it explicit is not required.
+
+define void @store_atomic_imm_8(i8* %p) {
+; X64-LABEL: store_atomic_imm_8
+; X64: movb
+; X64-NOT: movb
+; X32-LABEL: store_atomic_imm_8
+; X32: movb
+; X32-NOT: movb
+  store atomic i8 42, i8* %p release, align 1
+  ret void
+}
+
+define void @store_atomic_imm_16(i16* %p) {
+; X64-LABEL: store_atomic_imm_16
+; X64: movw
+; X64-NOT: movw
+; X32-LABEL: store_atomic_imm_16
+; X32: movw
+; X32-NOT: movw
+  store atomic i16 42, i16* %p monotonic, align 2
+  ret void
+}
+
+define void @store_atomic_imm_32(i32* %p) {
+; X64-LABEL: store_atomic_imm_32
+; X64: movl
+; X64-NOT: movl
+;   On 32 bits, there is an extra movl for each of those functions
+;   (probably for alignment reasons).
+; X32-LABEL: store_atomic_imm_32
+; X32: movl 4(%esp), %eax
+; X32: movl
+; X32-NOT: movl
+  store atomic i32 42, i32* %p release, align 4
+  ret void
+}
+
+define void @store_atomic_imm_64(i64* %p) {
+; X64-LABEL: store_atomic_imm_64
+; X64: movq
+; X64-NOT: movq
+;   These are implemented with a CAS loop on 32 bit architectures, and thus
+;   cannot be optimized in the same way as the others.
+; X32-LABEL: store_atomic_imm_64
+; X32: cmpxchg8b
+  store atomic i64 42, i64* %p release, align 8
+  ret void
+}
+
+; If an immediate is too big to fit in 32 bits, it cannot be store in one mov,
+; even on X64, one must use movabsq that can only target a register.
+define void @store_atomic_imm_64_big(i64* %p) {
+; X64-LABEL: store_atomic_imm_64_big
+; X64: movabsq
+; X64: movq
+  store atomic i64 100000000000, i64* %p monotonic, align 8
+  ret void
+}
+
+; It would be incorrect to replace a lock xchgl by a movl
+define void @store_atomic_imm_32_seq_cst(i32* %p) {
+; X64-LABEL: store_atomic_imm_32_seq_cst
+; X64: xchgl
+; X32-LABEL: store_atomic_imm_32_seq_cst
+; X32: xchgl
+  store atomic i32 42, i32* %p seq_cst, align 4
+  ret void
+}
+
+; ----- ADD -----
+
+define void @add_8(i8* %p) {
+; X64-LABEL: add_8
+; X64-NOT: lock
+; X64: addb
+; X64-NOT: movb
+; X32-LABEL: add_8
+; X32-NOT: lock
+; X32: addb
+; X32-NOT: movb
+  %1 = load atomic i8* %p seq_cst, align 1
+  %2 = add i8 %1, 2
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @add_16(i16* %p) {
+;   Currently the transformation is not done on 16 bit accesses, as the backend
+;   treat 16 bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: add_16
+; X64-NOT: addw
+; X32-LABEL: add_16
+; X32-NOT: addw
+  %1 = load atomic i16* %p acquire, align 2
+  %2 = add i16 %1, 2
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @add_32(i32* %p) {
+; X64-LABEL: add_32
+; X64-NOT: lock
+; X64: addl
+; X64-NOT: movl
+; X32-LABEL: add_32
+; X32-NOT: lock
+; X32: addl
+; X32-NOT: movl
+  %1 = load atomic i32* %p acquire, align 4
+  %2 = add i32 %1, 2
+  store atomic i32 %2, i32* %p monotonic, align 4
+  ret void
+}
+
+define void @add_64(i64* %p) {
+; X64-LABEL: add_64
+; X64-NOT: lock
+; X64: addq
+; X64-NOT: movq
+;   We do not check X86-32 as it cannot do 'addq'.
+; X32-LABEL: add_64
+  %1 = load atomic i64* %p acquire, align 8
+  %2 = add i64 %1, 2
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @add_32_seq_cst(i32* %p) {
+; X64-LABEL: add_32_seq_cst
+; X64: xchgl
+; X32-LABEL: add_32_seq_cst
+; X32: xchgl
+  %1 = load atomic i32* %p monotonic, align 4
+  %2 = add i32 %1, 2
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
+; ----- AND -----
+
+define void @and_8(i8* %p) {
+; X64-LABEL: and_8
+; X64-NOT: lock
+; X64: andb
+; X64-NOT: movb
+; X32-LABEL: and_8
+; X32-NOT: lock
+; X32: andb
+; X32-NOT: movb
+  %1 = load atomic i8* %p monotonic, align 1
+  %2 = and i8 %1, 2
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @and_16(i16* %p) {
+;   Currently the transformation is not done on 16 bit accesses, as the backend
+;   treat 16 bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: and_16
+; X64-NOT: andw
+; X32-LABEL: and_16
+; X32-NOT: andw
+  %1 = load atomic i16* %p acquire, align 2
+  %2 = and i16 %1, 2
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @and_32(i32* %p) {
+; X64-LABEL: and_32
+; X64-NOT: lock
+; X64: andl
+; X64-NOT: movl
+; X32-LABEL: and_32
+; X32-NOT: lock
+; X32: andl
+; X32-NOT: movl
+  %1 = load atomic i32* %p acquire, align 4
+  %2 = and i32 %1, 2
+  store atomic i32 %2, i32* %p release, align 4
+  ret void
+}
+
+define void @and_64(i64* %p) {
+; X64-LABEL: and_64
+; X64-NOT: lock
+; X64: andq
+; X64-NOT: movq
+;   We do not check X86-32 as it cannot do 'andq'.
+; X32-LABEL: and_64
+  %1 = load atomic i64* %p acquire, align 8
+  %2 = and i64 %1, 2
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @and_32_seq_cst(i32* %p) {
+; X64-LABEL: and_32_seq_cst
+; X64: xchgl
+; X32-LABEL: and_32_seq_cst
+; X32: xchgl
+  %1 = load atomic i32* %p monotonic, align 4
+  %2 = and i32 %1, 2
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
+; ----- OR -----
+
+define void @or_8(i8* %p) {
+; X64-LABEL: or_8
+; X64-NOT: lock
+; X64: orb
+; X64-NOT: movb
+; X32-LABEL: or_8
+; X32-NOT: lock
+; X32: orb
+; X32-NOT: movb
+  %1 = load atomic i8* %p acquire, align 1
+  %2 = or i8 %1, 2
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @or_16(i16* %p) {
+; X64-LABEL: or_16
+; X64-NOT: orw
+; X32-LABEL: or_16
+; X32-NOT: orw
+  %1 = load atomic i16* %p acquire, align 2
+  %2 = or i16 %1, 2
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @or_32(i32* %p) {
+; X64-LABEL: or_32
+; X64-NOT: lock
+; X64: orl
+; X64-NOT: movl
+; X32-LABEL: or_32
+; X32-NOT: lock
+; X32: orl
+; X32-NOT: movl
+  %1 = load atomic i32* %p acquire, align 4
+  %2 = or i32 %1, 2
+  store atomic i32 %2, i32* %p release, align 4
+  ret void
+}
+
+define void @or_64(i64* %p) {
+; X64-LABEL: or_64
+; X64-NOT: lock
+; X64: orq
+; X64-NOT: movq
+;   We do not check X86-32 as it cannot do 'orq'.
+; X32-LABEL: or_64
+  %1 = load atomic i64* %p acquire, align 8
+  %2 = or i64 %1, 2
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @or_32_seq_cst(i32* %p) {
+; X64-LABEL: or_32_seq_cst
+; X64: xchgl
+; X32-LABEL: or_32_seq_cst
+; X32: xchgl
+  %1 = load atomic i32* %p monotonic, align 4
+  %2 = or i32 %1, 2
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
+; ----- XOR -----
+
+define void @xor_8(i8* %p) {
+; X64-LABEL: xor_8
+; X64-NOT: lock
+; X64: xorb
+; X64-NOT: movb
+; X32-LABEL: xor_8
+; X32-NOT: lock
+; X32: xorb
+; X32-NOT: movb
+  %1 = load atomic i8* %p acquire, align 1
+  %2 = xor i8 %1, 2
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @xor_16(i16* %p) {
+; X64-LABEL: xor_16
+; X64-NOT: xorw
+; X32-LABEL: xor_16
+; X32-NOT: xorw
+  %1 = load atomic i16* %p acquire, align 2
+  %2 = xor i16 %1, 2
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @xor_32(i32* %p) {
+; X64-LABEL: xor_32
+; X64-NOT: lock
+; X64: xorl
+; X64-NOT: movl
+; X32-LABEL: xor_32
+; X32-NOT: lock
+; X32: xorl
+; X32-NOT: movl
+  %1 = load atomic i32* %p acquire, align 4
+  %2 = xor i32 %1, 2
+  store atomic i32 %2, i32* %p release, align 4
+  ret void
+}
+
+define void @xor_64(i64* %p) {
+; X64-LABEL: xor_64
+; X64-NOT: lock
+; X64: xorq
+; X64-NOT: movq
+;   We do not check X86-32 as it cannot do 'xorq'.
+; X32-LABEL: xor_64
+  %1 = load atomic i64* %p acquire, align 8
+  %2 = xor i64 %1, 2
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @xor_32_seq_cst(i32* %p) {
+; X64-LABEL: xor_32_seq_cst
+; X64: xchgl
+; X32-LABEL: xor_32_seq_cst
+; X32: xchgl
+  %1 = load atomic i32* %p monotonic, align 4
+  %2 = xor i32 %1, 2
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
+; ----- INC -----
+
+define void @inc_8(i8* %p) {
+; X64-LABEL: inc_8
+; X64-NOT: lock
+; X64: incb
+; X64-NOT: movb
+; X32-LABEL: inc_8
+; X32-NOT: lock
+; X32: incb
+; X32-NOT: movb
+; SLOW_INC-LABEL: inc_8
+; SLOW_INC-NOT: incb
+; SLOW_INC-NOT: movb
+  %1 = load atomic i8* %p seq_cst, align 1
+  %2 = add i8 %1, 1
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @inc_16(i16* %p) {
+;   Currently the transformation is not done on 16 bit accesses, as the backend
+;   treat 16 bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: inc_16
+; X64-NOT: incw
+; X32-LABEL: inc_16
+; X32-NOT: incw
+; SLOW_INC-LABEL: inc_16
+; SLOW_INC-NOT: incw
+  %1 = load atomic i16* %p acquire, align 2
+  %2 = add i16 %1, 1
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @inc_32(i32* %p) {
+; X64-LABEL: inc_32
+; X64-NOT: lock
+; X64: incl
+; X64-NOT: movl
+; X32-LABEL: inc_32
+; X32-NOT: lock
+; X32: incl
+; X32-NOT: movl
+; SLOW_INC-LABEL: inc_32
+; SLOW_INC-NOT: incl
+; SLOW_INC-NOT: movl
+  %1 = load atomic i32* %p acquire, align 4
+  %2 = add i32 %1, 1
+  store atomic i32 %2, i32* %p monotonic, align 4
+  ret void
+}
+
+define void @inc_64(i64* %p) {
+; X64-LABEL: inc_64
+; X64-NOT: lock
+; X64: incq
+; X64-NOT: movq
+;   We do not check X86-32 as it cannot do 'incq'.
+; X32-LABEL: inc_64
+; SLOW_INC-LABEL: inc_64
+; SLOW_INC-NOT: incq
+; SLOW_INC-NOT: movq
+  %1 = load atomic i64* %p acquire, align 8
+  %2 = add i64 %1, 1
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @inc_32_seq_cst(i32* %p) {
+; X64-LABEL: inc_32_seq_cst
+; X64: xchgl
+; X32-LABEL: inc_32_seq_cst
+; X32: xchgl
+  %1 = load atomic i32* %p monotonic, align 4
+  %2 = add i32 %1, 1
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
+; ----- DEC -----
+
+define void @dec_8(i8* %p) {
+; X64-LABEL: dec_8
+; X64-NOT: lock
+; X64: decb
+; X64-NOT: movb
+; X32-LABEL: dec_8
+; X32-NOT: lock
+; X32: decb
+; X32-NOT: movb
+; SLOW_INC-LABEL: dec_8
+; SLOW_INC-NOT: decb
+; SLOW_INC-NOT: movb
+  %1 = load atomic i8* %p seq_cst, align 1
+  %2 = sub i8 %1, 1
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @dec_16(i16* %p) {
+;   Currently the transformation is not done on 16 bit accesses, as the backend
+;   treat 16 bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: dec_16
+; X64-NOT: decw
+; X32-LABEL: dec_16
+; X32-NOT: decw
+; SLOW_INC-LABEL: dec_16
+; SLOW_INC-NOT: decw
+  %1 = load atomic i16* %p acquire, align 2
+  %2 = sub i16 %1, 1
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @dec_32(i32* %p) {
+; X64-LABEL: dec_32
+; X64-NOT: lock
+; X64: decl
+; X64-NOT: movl
+; X32-LABEL: dec_32
+; X32-NOT: lock
+; X32: decl
+; X32-NOT: movl
+; SLOW_INC-LABEL: dec_32
+; SLOW_INC-NOT: decl
+; SLOW_INC-NOT: movl
+  %1 = load atomic i32* %p acquire, align 4
+  %2 = sub i32 %1, 1
+  store atomic i32 %2, i32* %p monotonic, align 4
+  ret void
+}
+
+define void @dec_64(i64* %p) {
+; X64-LABEL: dec_64
+; X64-NOT: lock
+; X64: decq
+; X64-NOT: movq
+;   We do not check X86-32 as it cannot do 'decq'.
+; X32-LABEL: dec_64
+; SLOW_INC-LABEL: dec_64
+; SLOW_INC-NOT: decq
+; SLOW_INC-NOT: movq
+  %1 = load atomic i64* %p acquire, align 8
+  %2 = sub i64 %1, 1
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @dec_32_seq_cst(i32* %p) {
+; X64-LABEL: dec_32_seq_cst
+; X64: xchgl
+; X32-LABEL: dec_32_seq_cst
+; X32: xchgl
+  %1 = load atomic i32* %p monotonic, align 4
+  %2 = sub i32 %1, 1
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 1fd9085..02ea173 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -51,46 +51,6 @@ entry:
   ret <4 x i64> %shuffle
 }
 
-;;;
-;;; Check that some 256-bit vectors are xformed into 128 ops
-; CHECK: _A
-; CHECK: vshufpd $1
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vshufpd $1
-; CHECK-NEXT: vinsertf128 $1
-define <4 x i64> @A(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: _B
-; CHECK: vshufpd $1, %ymm
-define <4 x i64> @B(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 undef, i32 undef, i32 6>
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: movlhps
-; CHECK-NEXT: vextractf128  $1
-; CHECK-NEXT: movlhps
-; CHECK-NEXT: vinsertf128 $1
-define <4 x i64> @C(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 undef, i32 0, i32 undef, i32 6>
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: vpshufd $-96
-; CHECK: vpshufd $-6
-; CHECK: vinsertf128 $1
-define <8 x i32> @D(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 10, i32 10, i32 11, i32 11>
-  ret <8 x i32> %shuffle
-}
-
 ;;; Don't crash on movd
 ; CHECK: _VMOVZQI2PQI
 ; CHECK: vmovd (%
diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll
deleted file mode 100644
index d2a22d7..0000000
--- a/test/CodeGen/X86/avx-blend.ll
+++ /dev/null
@@ -1,202 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx  -mattr=+avx | FileCheck %s
-
-; AVX128 tests:
-
-;CHECK-LABEL: vsel_float:
-; select mask is <i1 true, i1 false, i1 true, i1 false>.
-; Big endian representation is 0101 = 5.
-; '1' means takes the first argument, '0' means takes the second argument.
-; This is the opposite of the intel syntax, thus we expect
-; the inverted mask: 1010 = 10.
-; According to the ABI:
-; v1 is in xmm0 => first argument is xmm0.
-; v2 is in xmm1 => second argument is xmm1.
-; result is in xmm0 => destination argument.
-;CHECK: vblendps    $10, %xmm1, %xmm0, %xmm0
-;CHECK: ret
-define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
-  ret <4 x float> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i32:
-;CHECK: vblendps   $10, %xmm1, %xmm0, %xmm0
-;CHECK: ret
-define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
-  ret <4 x i32> %vsel
-}
-
-
-;CHECK-LABEL: vsel_double:
-;CHECK: vmovsd
-;CHECK: ret
-define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
-  %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2
-  ret <2 x double> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i64:
-;CHECK: vmovsd
-;CHECK: ret
-define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) {
-  %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2
-  ret <2 x i64> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i8:
-;CHECK: vpblendvb
-;CHECK: ret
-define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
-  %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2
-  ret <16 x i8> %vsel
-}
-
-
-; AVX256 tests:
-
-
-;CHECK-LABEL: vsel_float8:
-;CHECK-NOT: vinsertf128
-; <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>
-; which translates into the boolean mask (big endian representation):
-; 00010001 = 17.
-; '1' means takes the first argument, '0' means takes the second argument.
-; This is the opposite of the intel syntax, thus we expect
-; the inverted mask: 11101110 = 238.
-;CHECK: vblendps    $238, %ymm1, %ymm0, %ymm0
-;CHECK: ret
-define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
-  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2
-  ret <8 x float> %vsel
-}
-
-;CHECK-LABEL: vsel_i328:
-;CHECK-NOT: vinsertf128
-;CHECK: vblendps    $238, %ymm1, %ymm0, %ymm0
-;CHECK-NEXT: ret
-define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
-  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
-  ret <8 x i32> %vsel
-}
-
-;CHECK-LABEL: vsel_double8:
-; select mask is 2x: 0001 => intel mask: ~0001 = 14
-; ABI:
-; v1 is in ymm0 and ymm1.
-; v2 is in ymm2 and ymm3.
-; result is in ymm0 and ymm1.
-; Compute the low part: res.low = blend v1.low, v2.low, blendmask
-;CHECK: vblendpd    $14, %ymm2, %ymm0, %ymm0
-; Compute the high part.
-;CHECK: vblendpd    $14, %ymm3, %ymm1, %ymm1
-;CHECK: ret
-define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
-  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2
-  ret <8 x double> %vsel
-}
-
-;CHECK-LABEL: vsel_i648:
-;CHECK: vblendpd    $14, %ymm2, %ymm0, %ymm0
-;CHECK: vblendpd    $14, %ymm3, %ymm1, %ymm1
-;CHECK: ret
-define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
-  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2
-  ret <8 x i64> %vsel
-}
-
-;CHECK-LABEL: vsel_double4:
-;CHECK-NOT: vinsertf128
-;CHECK: vblendpd $10
-;CHECK-NEXT: ret
-define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
-  ret <4 x double> %vsel
-}
-
-;; TEST blend + compares
-; CHECK: testa
-define <2 x double> @testa(<2 x double> %x, <2 x double> %y) {
-  ; CHECK: vcmplepd
-  ; CHECK: vblendvpd
-  %max_is_x = fcmp oge <2 x double> %x, %y
-  %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y
-  ret <2 x double> %max
-}
-
-; CHECK: testb
-define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {
-  ; CHECK: vcmpnlepd
-  ; CHECK: vblendvpd
-  %min_is_x = fcmp ult <2 x double> %x, %y
-  %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
-  ret <2 x double> %min
-}
-
-; If we can figure out a blend has a constant mask, we should emit the
-; blend instruction with an immediate mask
-define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
-; CHECK-LABEL: constant_blendvpd_avx:
-; CHECK-NOT: mov
-; CHECK: vblendpd
-; CHECK: ret
-  %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab
-  ret <4 x double> %1
-}
-
-define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
-; CHECK-LABEL: constant_blendvps_avx:
-; CHECK-NOT: mov
-; CHECK: vblendps
-; CHECK: ret
-  %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %xyzw, <8 x float> %abcd
-  ret <8 x float> %1
-}
-
-declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
-declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)
-
-;; 4 tests for shufflevectors that optimize to blend + immediate
-; CHECK-LABEL: @blend_shufflevector_4xfloat
-define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) {
-; Equivalent select mask is <i1 true, i1 false, i1 true, i1 false>.
-; Big endian representation is 0101 = 5.
-; '1' means takes the first argument, '0' means takes the second argument.
-; This is the opposite of the intel syntax, thus we expect
-; Inverted mask: 1010 = 10.
-; According to the ABI:
-; a is in xmm0 => first argument is xmm0.
-; b is in xmm1 => second argument is xmm1.
-; Result is in xmm0 => destination argument.
-; CHECK: vblendps $10, %xmm1, %xmm0, %xmm0
-; CHECK: ret
-  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x float> %1
-}
-
-; CHECK-LABEL: @blend_shufflevector_8xfloat
-define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b) {
-; CHECK: vblendps $190, %ymm1, %ymm0, %ymm0
-; CHECK: ret
-  %1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 15>
-  ret <8 x float> %1
-}
-
-; CHECK-LABEL: @blend_shufflevector_4xdouble
-define <4 x double> @blend_shufflevector_4xdouble(<4 x double> %a, <4 x double> %b) {
-; CHECK: vblendpd $2, %ymm1, %ymm0, %ymm0
-; CHECK: ret
-  %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
-  ret <4 x double> %1
-}
-
-; CHECK-LABEL: @blend_shufflevector_4xi64
-define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK: vblendpd $13, %ymm1, %ymm0, %ymm0
-; CHECK: ret
-  %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
-  ret <4 x i64> %1
-}
diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll
index 3e051bf..70ec124 100644
--- a/test/CodeGen/X86/avx-intel-ocl.ll
+++ b/test/CodeGen/X86/avx-intel-ocl.ll
@@ -89,23 +89,23 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
 ; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
 
 ; X64-LABEL: test_prolog_epilog
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
 ; X64: call
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
    %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
    ret <16 x float> %c
diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
new file mode 100644
index 0000000..d2b44cd
--- /dev/null
+++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+
+define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
+  ; CHECK: vblendpd
+  %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32) nounwind readnone
+
+
+define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
+  ; CHECK: vblendps
+  %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone
+
+
+define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
+  ; CHECK: vdpps
+  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone
+
+
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index ce31161..ef3e83f 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -455,21 +455,21 @@ define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
-  ; CHECK: vpslldq
-  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
-  ; CHECK: vpslldq
-  %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
+
+
+define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
+  ; CHECK: vpslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
+  ; CHECK: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
 declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone
 
 
@@ -551,21 +551,21 @@ define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
-  ; CHECK: vpsrldq
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
-  ; CHECK: vpsrldq
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
+  ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
+  ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
 declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone
 
 
@@ -818,18 +818,18 @@ declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly
 
 define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
   ; CHECK: vblendpd
-  %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+  %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }
-declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
 
 
 define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
   ; CHECK: vblendps
-  %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 
 define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
@@ -850,35 +850,35 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa
 
 define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
   ; CHECK: vdppd
-  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }
-declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
 
 
 define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
   ; CHECK: vdpps
-  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 
 define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
   ; CHECK: vinsertps
-  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 
 
 define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
   ; CHECK: vmpsadbw
-  %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+  %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1]
   ret <8 x i16> %res
 }
-declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone
+declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
 
 
 define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
@@ -899,10 +899,10 @@ declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) noun
 
 define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
   ; CHECK: vpblendw
-  %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+  %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1]
   ret <8 x i16> %res
 }
-declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
 
 
 define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) {
@@ -1770,18 +1770,18 @@ declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwi
 
 define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
   ; CHECK: vblendpd
-  %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
+  %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
   ret <4 x double> %res
 }
-declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32) nounwind readnone
+declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
 
 
 define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
   ; CHECK: vblendps
-  %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
+  %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
   ret <8 x float> %res
 }
-declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone
+declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
 
 
 define <4 x double> @test_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
@@ -1950,10 +1950,10 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
 
 define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
   ; CHECK: vdpps
-  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
+  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
   ret <8 x float> %res
 }
-declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone
+declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
 
 
 define <4 x double> @test_x86_avx_hadd_pd_256(<4 x double> %a0, <4 x double> %a1) {
@@ -2309,7 +2309,7 @@ declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) noun
 
 define <2 x double> @test_x86_avx_vpermil_pd(<2 x double> %a0) {
   ; CHECK: vpermilpd
-  %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 7) ; <<2 x double>> [#uses=1]
+  %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 1) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }
 declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8) nounwind readnone
@@ -2324,7 +2324,7 @@ declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8) nounwind rea
 
 
 define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) {
-  ; CHECK: vpshufd
+  ; CHECK: vpermilps
   %res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
diff --git a/test/CodeGen/X86/avx-movdup.ll b/test/CodeGen/X86/avx-movdup.ll
deleted file mode 100644
index 42d84de..0000000
--- a/test/CodeGen/X86/avx-movdup.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vmovsldup
-define <8 x float> @movdupA(<8 x float> %src) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <8 x float> %src, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
-  ret <8 x float> %shuffle.i
-}
-
-; CHECK: vmovshdup
-define <8 x float> @movdupB(<8 x float> %src) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <8 x float> %src, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
-  ret <8 x float> %shuffle.i
-}
-
-; CHECK: vmovsldup
-define <4 x i64> @movdupC(<4 x i64> %src) nounwind uwtable readnone ssp {
-entry:
-  %0 = bitcast <4 x i64> %src to <8 x float>
-  %shuffle.i = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
-  %1 = bitcast <8 x float> %shuffle.i to <4 x i64>
-  ret <4 x i64> %1
-}
-
-; CHECK: vmovshdup
-define <4 x i64> @movdupD(<4 x i64> %src) nounwind uwtable readnone ssp {
-entry:
-  %0 = bitcast <4 x i64> %src to <8 x float>
-  %shuffle.i = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
-  %1 = bitcast <8 x float> %shuffle.i to <4 x i64>
-  ret <4 x i64> %1
-}
-
diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll
deleted file mode 100644
index fb2287f..0000000
--- a/test/CodeGen/X86/avx-sext.ll
+++ /dev/null
@@ -1,199 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=pentium4 | FileCheck %s -check-prefix=SSE2
-
-define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
-; AVX: sext_8i16_to_8i32
-; AVX: vpmovsxwd
-
-  %B = sext <8 x i16> %A to <8 x i32>
-  ret <8 x i32>%B
-}
-
-define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
-; AVX: sext_4i32_to_4i64
-; AVX: vpmovsxdq
-
-  %B = sext <4 x i32> %A to <4 x i64>
-  ret <4 x i64>%B
-}
-
-; AVX: load_sext_test1
-; AVX: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test1
-; SSSE3: movq
-; SSSE3: punpcklwd %xmm{{.*}}, %xmm{{.*}}
-; SSSE3: psrad $16
-; SSSE3: ret
-
-; SSE2: load_sext_test1
-; SSE2: movq
-; SSE2: punpcklwd %xmm{{.*}}, %xmm{{.*}}
-; SSE2: psrad $16
-; SSE2: ret
-define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
- %X = load <4 x i16>* %ptr
- %Y = sext <4 x i16> %X to <4 x i32>
- ret <4 x i32>%Y
-}
-
-; AVX: load_sext_test2
-; AVX: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test2
-; SSSE3: movd
-; SSSE3: pshufb
-; SSSE3: psrad $24
-; SSSE3: ret
-
-; SSE2: load_sext_test2
-; SSE2: movl
-; SSE2: psrad $24
-; SSE2: ret
-define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
- %X = load <4 x i8>* %ptr
- %Y = sext <4 x i8> %X to <4 x i32>
- ret <4 x i32>%Y
-}
-
-; AVX: load_sext_test3
-; AVX: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test3
-; SSSE3: movsbq
-; SSSE3: movsbq
-; SSSE3: punpcklqdq
-; SSSE3: ret
-
-; SSE2: load_sext_test3
-; SSE2: movsbq
-; SSE2: movsbq
-; SSE2: punpcklqdq
-; SSE2: ret
-define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
- %X = load <2 x i8>* %ptr
- %Y = sext <2 x i8> %X to <2 x i64>
- ret <2 x i64>%Y
-}
-
-; AVX: load_sext_test4
-; AVX: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test4
-; SSSE3: movswq
-; SSSE3: movswq
-; SSSE3: punpcklqdq
-; SSSE3: ret
-
-; SSE2: load_sext_test4
-; SSE2: movswq
-; SSE2: movswq
-; SSE2: punpcklqdq
-; SSE2: ret
-define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
- %X = load <2 x i16>* %ptr
- %Y = sext <2 x i16> %X to <2 x i64>
- ret <2 x i64>%Y
-}
-
-; AVX: load_sext_test5
-; AVX: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test5
-; SSSE3: movslq
-; SSSE3: movslq
-; SSSE3: punpcklqdq
-; SSSE3: ret
-
-; SSE2: load_sext_test5
-; SSE2: movslq
-; SSE2: movslq
-; SSE2: punpcklqdq
-; SSE2: ret
-define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
- %X = load <2 x i32>* %ptr
- %Y = sext <2 x i32> %X to <2 x i64>
- ret <2 x i64>%Y
-}
-
-; AVX: load_sext_test6
-; AVX: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test6
-; SSSE3: movq
-; SSSE3: punpcklbw
-; SSSE3: psraw $8
-; SSSE3: ret
-
-; SSE2: load_sext_test6
-; SSE2: movq
-; SSE2: punpcklbw
-; SSE2: psraw $8
-; SSE2: ret
-define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
- %X = load <8 x i8>* %ptr
- %Y = sext <8 x i8> %X to <8 x i16>
- ret <8 x i16>%Y
-}
-
-; AVX: sext_4i1_to_4i64
-; AVX: vpslld  $31
-; AVX: vpsrad  $31
-; AVX: vpmovsxdq
-; AVX: vpmovsxdq
-; AVX: ret
-define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
-  %extmask = sext <4 x i1> %mask to <4 x i64>
-  ret <4 x i64> %extmask
-}
-
-; AVX-LABEL: sext_16i8_to_16i16
-; AVX: vpmovsxbw
-; AVX: vmovhlps
-; AVX: vpmovsxbw
-; AVX: ret
-define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
- %X = load <16 x i8>* %ptr
- %Y = sext <16 x i8> %X to <16 x i16>
- ret <16 x i16> %Y
-}
-
-; AVX: sext_4i8_to_4i64
-; AVX: vpslld  $24
-; AVX: vpsrad  $24
-; AVX: vpmovsxdq
-; AVX: vpmovsxdq
-; AVX: ret
-define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
-  %extmask = sext <4 x i8> %mask to <4 x i64>
-  ret <4 x i64> %extmask
-}
-
-; AVX: sext_4i8_to_4i64
-; AVX: vpmovsxbd
-; AVX: vpmovsxdq
-; AVX: vpmovsxdq
-; AVX: ret
-define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
- %X = load <4 x i8>* %ptr
- %Y = sext <4 x i8> %X to <4 x i64>
- ret <4 x i64>%Y
-}
-
-; AVX: sext_4i16_to_4i64
-; AVX: vpmovsxwd
-; AVX: vpmovsxdq
-; AVX: vpmovsxdq
-; AVX: ret
-define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
- %X = load <4 x i16>* %ptr
- %Y = sext <4 x i16> %X to <4 x i64>
- ret <4 x i64>%Y
-}
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
deleted file mode 100644
index 4a996d7..0000000
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ /dev/null
@@ -1,336 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; PR11102
-define <4 x float> @test1(<4 x float> %a) nounwind {
-  %b = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 5, i32 undef, i32 undef>
-  ret <4 x float> %b
-; CHECK-LABEL: test1:
-;; TODO: This test could be improved by removing the xor instruction and
-;; having vinsertps zero out the needed elements.
-; CHECK: vxorps
-; CHECK: vinsertps
-}
-
-; rdar://10538417
-define <3 x i64> @test2(<2 x i64> %v) nounwind readnone {
-; CHECK-LABEL: test2:
-; CHECK: vinsertf128
-  %1 = shufflevector <2 x i64> %v, <2 x i64> %v, <3 x i32> <i32 0, i32 1, i32 undef>
-  %2 = shufflevector <3 x i64> zeroinitializer, <3 x i64> %1, <3 x i32> <i32 3, i32 4, i32 2>
-  ret <3 x i64> %2
-; CHECK: ret
-}
-
-define <4 x i64> @test3(<4 x i64> %a, <4 x i64> %b) nounwind {
-  %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 undef>
-  ret <4 x i64> %c
-; CHECK-LABEL: test3:
-; CHECK: vblendpd
-; CHECK: ret
-}
-
-define <8 x float> @test4(float %a) nounwind {
-  %b = insertelement <8 x float> zeroinitializer, float %a, i32 0
-  ret <8 x float> %b
-; CHECK-LABEL: test4:
-; CHECK: vinsertf128
-}
-
-; rdar://10594409
-define <8 x float> @test5(float* nocapture %f) nounwind uwtable readonly ssp {
-entry:
-  %0 = bitcast float* %f to <4 x float>*
-  %1 = load <4 x float>* %0, align 16
-; CHECK: test5
-; CHECK: vmovaps
-; CHECK-NOT: vxorps
-; CHECK-NOT: vinsertf128
-  %shuffle.i = shufflevector <4 x float> %1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
-  ret <8 x float> %shuffle.i
-}
-
-define <4 x double> @test6(double* nocapture %d) nounwind uwtable readonly ssp {
-entry:
-  %0 = bitcast double* %d to <2 x double>*
-  %1 = load <2 x double>* %0, align 16
-; CHECK: test6
-; CHECK: vmovaps
-; CHECK-NOT: vxorps
-; CHECK-NOT: vinsertf128
-  %shuffle.i = shufflevector <2 x double> %1, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
-  ret <4 x double> %shuffle.i
-}
-
-define <16 x i16> @test7(<4 x i16> %a) nounwind {
-; CHECK: test7
-  %b = shufflevector <4 x i16> %a, <4 x i16> undef, <16 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK: ret
-  ret <16 x i16> %b
-}
-
-; CHECK: test8
-define void @test8() {
-entry:
-  %0 = load <16 x i64> addrspace(1)* null, align 128
-  %1 = shufflevector <16 x i64> <i64 undef, i64 undef, i64 0, i64 undef, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 undef, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i64> %0, <16 x i32> <i32 17, i32 18, i32 2, i32 undef, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 26>
-  %2 = shufflevector <16 x i64> %1, <16 x i64> %0, <16 x i32> <i32 0, i32 1, i32 2, i32 30, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 11, i32 undef, i32 22, i32 20, i32 15>
-  store <16 x i64> %2, <16 x i64> addrspace(1)* undef, align 128
-; CHECK: ret
-  ret void
-}
-
-; Extract a value from a shufflevector..
-define i32 @test9(<4 x i32> %a) nounwind {
-; CHECK: test9
-; CHECK: vpextrd
-  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 undef, i32 4>
-  %r = extractelement <8 x i32> %b, i32 2
-; CHECK: ret
-  ret i32 %r
-}
-
-; Extract a value which is the result of an undef mask.
-define i32 @test10(<4 x i32> %a) nounwind {
-; CHECK: @test10
-; CHECK-NOT: {{^[^#]*[a-z]}}
-; CHECK: ret
-  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %r = extractelement <8 x i32> %b, i32 2
-  ret i32 %r
-}
-
-define <4 x float> @test11(<4 x float> %a) nounwind  {
-; CHECK: test11
-; CHECK: vpshufd $27
-  %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  ret <4 x float> %tmp1
-}
-
-define <4 x float> @test12(<4 x float>* %a) nounwind  {
-; CHECK: test12
-; CHECK: vpshufd
-  %tmp0 = load <4 x float>* %a
-  %tmp1 = shufflevector <4 x float> %tmp0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  ret <4 x float> %tmp1
-}
-
-define <4 x i32> @test13(<4 x i32> %a) nounwind  {
-; CHECK: test13
-; CHECK: vpshufd $27
-  %tmp1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test14(<4 x i32>* %a) nounwind  {
-; CHECK: test14
-; CHECK: vpshufd $27, (
-  %tmp0 = load <4 x i32>* %a
-  %tmp1 = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  ret <4 x i32> %tmp1
-}
-
-; CHECK: test15
-; CHECK: vpshufd $8
-; CHECK: ret
-define <4 x i32> @test15(<2 x i32>%x) nounwind readnone {
-  %x1 = shufflevector <2 x i32> %x, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  ret <4 x i32>%x1
-}
-
-; rdar://10974078
-define <8 x float> @test16(float* nocapture %f) nounwind uwtable readonly ssp {
-entry:
-  %0 = bitcast float* %f to <4 x float>*
-  %1 = load <4 x float>* %0, align 8
-; CHECK: test16
-; CHECK: vmovups
-; CHECK-NOT: vxorps
-; CHECK-NOT: vinsertf128
-  %shuffle.i = shufflevector <4 x float> %1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
-  ret <8 x float> %shuffle.i
-}
-
-; PR12413
-; CHECK: shuf1
-; CHECK: vpshufb
-; CHECK: vpshufb
-; CHECK: vpshufb
-; CHECK: vpshufb
-define <32 x i8> @shuf1(<32 x i8> %inval1, <32 x i8> %inval2) {
-entry:
- %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
- ret <32 x i8> %0
-}
-
-; handle the case where only half of the 256-bits is splittable
-; CHECK: shuf2
-; CHECK: vpshufb
-; CHECK: vpshufb
-; CHECK: vpextrb
-; CHECK: vpextrb
-define <32 x i8> @shuf2(<32 x i8> %inval1, <32 x i8> %inval2) {
-entry:
- %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 31, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
- ret <32 x i8> %0
-}
-
-; CHECK: blend1
-; CHECK: vblendps
-; CHECK: ret
-define <4 x i32> @blend1(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-  ret <4 x i32> %t
-}
-
-; CHECK: blend2
-; CHECK: vblendps
-; CHECK: ret
-define <4 x i32> @blend2(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x i32> %t
-}
-
-; CHECK: blend2a
-; CHECK: vblendps
-; CHECK: ret
-define <4 x float> @blend2a(<4 x float> %a, <4 x float> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x float> %t
-}
-
-; CHECK: blend3
-; CHECK-NOT: vblendps
-; CHECK: ret
-define <4 x i32> @blend3(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 2, i32 7>
-  ret <4 x i32> %t
-}
-
-; CHECK: blend4
-; CHECK: vblendpd
-; CHECK: ret
-define <4 x i64> @blend4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-  ret <4 x i64> %t
-}
-
-; CHECK: narrow
-; CHECK: vpermilps
-; CHECK: ret
-define <16 x i16> @narrow(<16 x i16> %a) nounwind alwaysinline {
-  %t = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 undef, i32 14, i32 15, i32 undef, i32 undef>
-  ret <16 x i16> %t
-}
-
-;CHECK-LABEL: test17:
-;CHECK-NOT: vinsertf128
-;CHECK: ret
-define   <8 x float> @test17(<4 x float> %y) {
-  %x = shufflevector <4 x float> %y, <4 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  ret <8 x float> %x
-}
-
-; CHECK: test18
-; CHECK: vmovshdup
-; CHECK: vblendps
-; CHECK: ret
-define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
-  %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  ret <8 x float>%S
-}
-
-; CHECK: test19
-; CHECK: vmovsldup
-; CHECK: vblendps
-; CHECK: ret
-define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
-  %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  ret <8 x float>%S
-}
-
-; rdar://12684358
-; Make sure loads happen before stores.
-; CHECK: swap8doubles
-; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
-; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
-; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}}
-; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}}
-; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
-; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
-; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
-; CHECK: vextractf128
-; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
-; CHECK: vextractf128
-; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi)
-; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi)
-define void @swap8doubles(double* nocapture %A, double* nocapture %C) nounwind uwtable ssp {
-entry:
-  %add.ptr = getelementptr inbounds double* %A, i64 2
-  %v.i = bitcast double* %A to <2 x double>*
-  %0 = load <2 x double>* %v.i, align 1
-  %shuffle.i.i = shufflevector <2 x double> %0, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
-  %v1.i = bitcast double* %add.ptr to <2 x double>*
-  %1 = load <2 x double>* %v1.i, align 1
-  %2 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i, <2 x double> %1, i8 1) nounwind
-  %add.ptr1 = getelementptr inbounds double* %A, i64 6
-  %add.ptr2 = getelementptr inbounds double* %A, i64 4
-  %v.i27 = bitcast double* %add.ptr2 to <2 x double>*
-  %3 = load <2 x double>* %v.i27, align 1
-  %shuffle.i.i28 = shufflevector <2 x double> %3, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
-  %v1.i29 = bitcast double* %add.ptr1 to <2 x double>*
-  %4 = load <2 x double>* %v1.i29, align 1
-  %5 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i28, <2 x double> %4, i8 1) nounwind
-  %6 = bitcast double* %C to <4 x double>*
-  %7 = load <4 x double>* %6, align 32
-  %add.ptr5 = getelementptr inbounds double* %C, i64 4
-  %8 = bitcast double* %add.ptr5 to <4 x double>*
-  %9 = load <4 x double>* %8, align 32
-  %shuffle.i26 = shufflevector <4 x double> %7, <4 x double> undef, <2 x i32> <i32 0, i32 1>
-  %10 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %7, i8 1)
-  %shuffle.i = shufflevector <4 x double> %9, <4 x double> undef, <2 x i32> <i32 0, i32 1>
-  %11 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %9, i8 1)
-  store <2 x double> %shuffle.i26, <2 x double>* %v.i, align 16
-  store <2 x double> %10, <2 x double>* %v1.i, align 16
-  store <2 x double> %shuffle.i, <2 x double>* %v.i27, align 16
-  store <2 x double> %11, <2 x double>* %v1.i29, align 16
-  store <4 x double> %2, <4 x double>* %6, align 32
-  store <4 x double> %5, <4 x double>* %8, align 32
-  ret void
-}
-declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
-declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
-
-; this test case just should not fail
-define void @test20() {
-  %a0 = insertelement <3 x double> <double 0.000000e+00, double 0.000000e+00, double undef>, double 0.000000e+00, i32 2
-  store <3 x double> %a0, <3 x double>* undef, align 1
-  %a1 = insertelement <3 x double> <double 0.000000e+00, double 0.000000e+00, double undef>, double undef, i32 2
-  store <3 x double> %a1, <3 x double>* undef, align 1
-  ret void
-}
-
-define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
-; CHECK-LABEL: test_insert_64_zext
-; CHECK-NOT: xor
-; CHECK: vmovq
-  %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
-  ret <2 x i64> %1
-}
-
-;; Ensure we don't use insertps from non v4x32 vectors.
-;; On SSE4.1 it works because bigger vectors use more than 1 register.
-;; On AVX they get passed in a single register.
-;; FIXME: We could probably optimize this case, if we're only using the
-;; first 4 indices.
-define <4 x i32> @insert_from_diff_size(<8 x i32> %x) {
-; CHECK-LABEL: insert_from_diff_size:
-; CHECK-NOT: insertps
-; CHECK: ret
-  %vecext = extractelement <8 x i32> %x, i32 0
-  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
-  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
-  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
-  %a.0 = extractelement <8 x i32> %x, i32 0
-  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a.0, i32 3
-  ret <4 x i32> %vecinit3
-}
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index b1b2f8b..98c1645 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -1,9 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
 
 
-; CHECK: vpunpcklbw %xmm
-; CHECK-NEXT: vpunpckhbw %xmm
-; CHECK-NEXT: vpshufd $85
+; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; CHECK-NEXT: vinsertf128 $1
 define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
 entry:
@@ -11,8 +9,7 @@ entry:
   ret <32 x i8> %shuffle
 }
 
-; CHECK: vpunpckhwd %xmm
-; CHECK-NEXT: vpshufd $85
+; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11]
 ; CHECK-NEXT: vinsertf128 $1
 define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
 entry:
@@ -21,7 +18,7 @@ entry:
 }
 
 ; CHECK: vmovq
-; CHECK-NEXT: vmovlhps %xmm
+; CHECK-NEXT: vunpcklpd %xmm
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
 entry:
@@ -32,7 +29,7 @@ entry:
   ret <4 x i64> %vecinit6.i
 }
 
-; CHECK: vpermilpd $0
+; CHECK: vunpcklpd %xmm
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
 entry:
@@ -72,7 +69,7 @@ __load_and_broadcast_32.exit1249:                 ; preds = %load.i1247, %for_ex
   ret <8 x float> %load_broadcast12281250
 }
 
-; CHECK: vpshufd $0
+; CHECK: vpermilps $4
 ; CHECK-NEXT: vinsertf128 $1
 define <8 x float> @funcF(i32 %val) nounwind {
   %ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
@@ -81,7 +78,7 @@ define <8 x float> @funcF(i32 %val) nounwind {
   ret <8 x float> %tmp
 }
 
-; CHECK: vpshufd  $0
+; CHECK: vpermilps $0
 ; CHECK-NEXT: vinsertf128  $1
 define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
 entry:
@@ -90,7 +87,7 @@ entry:
 }
 
 ; CHECK: vextractf128  $1
-; CHECK-NEXT: vpshufd
+; CHECK-NEXT: vpermilps $85
 ; CHECK-NEXT: vinsertf128  $1
 define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
 entry:
diff --git a/test/CodeGen/X86/avx-vmovddup.ll b/test/CodeGen/X86/avx-vmovddup.ll
deleted file mode 100644
index 1c56fe2..0000000
--- a/test/CodeGen/X86/avx-vmovddup.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vmovddup %ymm
-define <4 x i64> @A(<4 x i64> %a) {
-  %c = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
-  ret <4 x i64> %c
-}
-
-; CHECK: vmovddup (%
-define <4 x i64> @B(<4 x i64>* %ptr) {
-  %a = load <4 x i64>* %ptr
-  %c = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
-  ret <4 x i64> %c
-}
diff --git a/test/CodeGen/X86/avx-vperm2f128.ll b/test/CodeGen/X86/avx-vperm2f128.ll
deleted file mode 100644
index c20775b..0000000
--- a/test/CodeGen/X86/avx-vperm2f128.ll
+++ /dev/null
@@ -1,69 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: _A
-; CHECK: vperm2f128 $1
-define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: _B
-; CHECK: vblendps $240
-define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: _C
-; CHECK: vperm2f128 $0
-define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: _D
-; CHECK: vperm2f128 $17
-define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: _E
-; CHECK: vperm2f128 $17
-define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  ret <32 x i8> %shuffle
-}
-
-; CHECK: _E2
-; CHECK: vperm2f128 $3
-define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
-  ret <4 x i64> %shuffle
-}
-
-;;;; Cases with undef indicies mixed in the mask
-
-; CHECK: _F
-; CHECK: vperm2f128 $33
-define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
-  ret <8 x float> %shuffle
-}
-
-;;;; Cases we must not select vperm2f128
-
-; CHECK: _G
-; CHECK-NOT: vperm2f128
-define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
-  ret <8 x float> %shuffle
-}
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
new file mode 100644
index 0000000..a103405
--- /dev/null
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -0,0 +1,202 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+
+define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: A:
+; ALL:       ## BB#0: ## %entry
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; ALL-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: B:
+; ALL:       ## BB#0: ## %entry
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: C:
+; ALL:       ## BB#0: ## %entry
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: D:
+; ALL:       ## BB#0: ## %entry
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; ALL-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: E:
+; ALL:       ## BB#0: ## %entry
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; ALL-NEXT:    retq
+entry:
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: E2:
+; ALL:       ## BB#0: ## %entry
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; ALL-NEXT:    retq
+entry:
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x i64> %shuffle
+}
+
+define <32 x i8> @Ei(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: Ei:
+; AVX1:       ## BB#0: ## %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: Ei:
+; AVX2:       ## BB#0: ## %entry
+; AVX2-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT:    retq
+entry:
+  ; add forces execution domain
+  %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <4 x i64> @E2i(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: E2i:
+; AVX1:       ## BB#0: ## %entry
+; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: E2i:
+; AVX2:       ## BB#0: ## %entry
+; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX2-NEXT:    retq
+entry:
+  ; add forces execution domain
+  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
+  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x i64> %shuffle
+}
+
+define <8 x i32> @E3i(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: E3i:
+; AVX1:       ## BB#0: ## %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: E3i:
+; AVX2:       ## BB#0: ## %entry
+; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT:    retq
+entry:
+  ; add forces execution domain
+  %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i32> %shuffle
+}
+
+define <16 x i16> @E4i(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: E4i:
+; AVX1:       ## BB#0: ## %entry
+; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: E4i:
+; AVX2:       ## BB#0: ## %entry
+; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+entry:
+  ; add forces execution domain
+  %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: E5i:
+; AVX1:       ## BB#0: ## %entry
+; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vmovaps (%rsi), %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: E5i:
+; AVX2:       ## BB#0: ## %entry
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+entry:
+  %c = load <16 x i16>* %a
+  %d = load <16 x i16>* %b
+  %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <16 x i16> %shuffle
+}
+
+;;;; Cases with undef indicies mixed in the mask
+
+define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: F:
+; ALL:       ## BB#0: ## %entry
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1,0,1]
+; ALL-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
+  ret <8 x float> %shuffle
+}
+
+;;;; Cases we must not select vperm2f128
+
+define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: G:
+; AVX1:       ## BB#0: ## %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: G:
+; AVX2:       ## BB#0: ## %entry
+; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
+; AVX2-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
+  ret <8 x float> %shuffle
+}
diff --git a/test/CodeGen/X86/avx-vpermil.ll b/test/CodeGen/X86/avx-vpermil.ll
deleted file mode 100644
index b7f8d72..0000000
--- a/test/CodeGen/X86/avx-vpermil.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vpermilps
-define <8 x float> @funcA(<8 x float> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 1, i32 5, i32 6, i32 7, i32 5>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: vpermilpd
-define <4 x double> @funcB(<4 x double> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3>
-  ret <4 x double> %shuffle
-}
-
-; CHECK: vpermilps
-define <8 x i32> @funcC(<8 x i32> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 1, i32 5, i32 6, i32 7, i32 5>
-  ret <8 x i32> %shuffle
-}
-
-; CHECK: vpermilpd
-define <4 x i64> @funcD(<4 x i64> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3>
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: vpermilpd
-define <4 x i64> @funcQ(<4 x i64>* %a) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <4 x i64>* %a
-  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3>
-  ret <4 x i64> %shuffle
-}
-
-; vpermil should match masks like this: <u,3,1,2,4,u,5,6>. Check that the
-; target specific mask was correctly generated.
-; CHECK: vpermilps $-100
-define <8 x float> @funcE(<8 x float> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 8, i32 3, i32 1, i32 2, i32 4, i32 8, i32 5, i32 6>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: palignr $8
-; CHECK: palignr $8
-define <8 x float> @funcF(<8 x float> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-  ret <8 x float> %shuffle
-}
diff --git a/test/CodeGen/X86/avx-vshufp.ll b/test/CodeGen/X86/avx-vshufp.ll
deleted file mode 100644
index ad3dbc1..0000000
--- a/test/CodeGen/X86/avx-vshufp.ll
+++ /dev/null
@@ -1,157 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vshufps  $-53, %ymm
-define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: vshufps  $-53, (%{{.*}}), %ymm
-define <8 x float> @A2(<8 x float>* %a, <8 x float>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <8 x float>* %a
-  %b2 = load <8 x float>* %b
-  %shuffle = shufflevector <8 x float> %a2, <8 x float> %b2, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: vshufps  $-53, %ymm
-define <8 x i32> @A3(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
-  ret <8 x i32> %shuffle
-}
-
-; CHECK: vshufps  $-53, (%{{.*}}), %ymm
-define <8 x i32> @A4(<8 x i32>* %a, <8 x i32>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <8 x i32>* %a
-  %b2 = load <8 x i32>* %b
-  %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b2, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
-  ret <8 x i32> %shuffle
-}
-
-; CHECK: vblendpd  $10, %ymm
-define <4 x double> @B(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x double> %shuffle
-}
-
-; CHECK: vblendpd  $10, (%{{.*}}), %ymm
-define <4 x double> @B2(<4 x double>* %a, <4 x double>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <4 x double>* %a
-  %b2 = load <4 x double>* %b
-  %shuffle = shufflevector <4 x double> %a2, <4 x double> %b2, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x double> %shuffle
-}
-
-; CHECK: vblendpd  $10, %ymm
-define <4 x i64> @B3(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: vblendpd  $10, (%{{.*}}), %ymm
-define <4 x i64> @B4(<4 x i64>* %a, <4 x i64>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <4 x i64>* %a
-  %b2 = load <4 x i64>* %b
-  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b2, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: vshufps  $-53, %ymm
-define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 undef, i32 undef, i32 11, i32 undef, i32 6, i32 12, i32 undef>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: vblendpd  $2, %ymm
-define <4 x double> @D(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 undef>
-  ret <4 x double> %shuffle
-}
-
-; CHECK: vshufps $-55, %ymm
-define <8 x float> @E(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 10, i32 0, i32 3, i32 13, i32 14, i32 4, i32 7>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: vshufpd  $8, %ymm
-define <4 x double> @F(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 7>
-  ret <4 x double> %shuffle
-}
-
-; CHECK: vshufps  $-53, %xmm
-define <4 x float> @A128(<4 x float> %a, <4 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 4, i32 7>
-  ret <4 x float> %shuffle
-}
-
-; CHECK: vshufps  $-53, (%{{.*}}), %xmm
-define <4 x float> @A2128(<4 x float>* %a, <4 x float>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <4 x float>* %a
-  %b2 = load <4 x float>* %b
-  %shuffle = shufflevector <4 x float> %a2, <4 x float> %b2, <4 x i32> <i32 3, i32 2, i32 4, i32 7>
-  ret <4 x float> %shuffle
-}
-
-; CHECK: vshufps  $-53, %xmm
-define <4 x i32> @A3128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 4, i32 7>
-  ret <4 x i32> %shuffle
-}
-
-; CHECK: vshufps  $-53, (%{{.*}}), %xmm
-define <4 x i32> @A4128(<4 x i32>* %a, <4 x i32>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <4 x i32>* %a
-  %b2 = load <4 x i32>* %b
-  %shuffle = shufflevector <4 x i32> %a2, <4 x i32> %b2, <4 x i32> <i32 3, i32 2, i32 4, i32 7>
-  ret <4 x i32> %shuffle
-}
-
-; CHECK: vshufpd  $1, %xmm
-define <2 x double> @B128(<2 x double> %a, <2 x double> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 2>
-  ret <2 x double> %shuffle
-}
-
-; CHECK: vshufpd  $1, (%{{.*}}), %xmm
-define <2 x double> @B2128(<2 x double>* %a, <2 x double>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <2 x double>* %a
-  %b2 = load <2 x double>* %b
-  %shuffle = shufflevector <2 x double> %a2, <2 x double> %b2, <2 x i32> <i32 1, i32 2>
-  ret <2 x double> %shuffle
-}
-
-; CHECK: vshufpd  $1, %xmm
-define <2 x i64> @B3128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
-  ret <2 x i64> %shuffle
-}
-
-; CHECK: vshufpd  $1, (%{{.*}}), %xmm
-define <2 x i64> @B4128(<2 x i64>* %a, <2 x i64>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <2 x i64>* %a
-  %b2 = load <2 x i64>* %b
-  %shuffle = shufflevector <2 x i64> %a2, <2 x i64> %b2, <2 x i32> <i32 1, i32 2>
-  ret <2 x i64> %shuffle
-}
diff --git a/test/CodeGen/X86/avx-zext.ll b/test/CodeGen/X86/avx-zext.ll
deleted file mode 100644
index 7511746..0000000
--- a/test/CodeGen/X86/avx-zext.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
-;CHECK-LABEL: zext_8i16_to_8i32:
-;CHECK: vpunpckhwd
-;CHECK: ret
-
-  %B = zext <8 x i16> %A to <8 x i32>
-  ret <8 x i32>%B
-}
-
-define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
-;CHECK-LABEL: zext_4i32_to_4i64:
-;CHECK: vpunpckhdq
-;CHECK: ret
-
-  %B = zext <4 x i32> %A to <4 x i64>
-  ret <4 x i64>%B
-}
-
-define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
-;CHECK-LABEL: zext_8i8_to_8i32:
-;CHECK: vpunpckhwd
-;CHECK: vpmovzxwd
-;CHECK: vinsertf128
-;CHECK: ret
-  %t = zext <8 x i8> %z to <8 x i32>
-  ret <8 x i32> %t
-}
-
-; PR17654
-define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) {
-; CHECK-LABEL: zext_16i8_to_16i16:
-; CHECK: vpxor
-; CHECK: vpunpckhbw
-; CHECK: vpunpcklbw
-; CHECK: vinsertf128
-; CHECK: ret
-  %t = zext <16 x i8> %z to <16 x i16>
-  ret <16 x i16> %t
-}
diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll
index 6069c14..cba6d98 100644
--- a/test/CodeGen/X86/avx.ll
+++ b/test/CodeGen/X86/avx.ll
@@ -60,7 +60,7 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
 ; X32: movl    8(%esp), %ecx
 ; CHECK-NOT: mov
 ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: vinsertps    $192, 12(%{{...}},%{{...}}), %
+; CHECK: vinsertps    $-64, 12(%{{...}},%{{...}}), %
 ; CHECK-NEXT: ret
   %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
   %2 = load <4 x float>* %1, align 16
diff --git a/test/CodeGen/X86/avx1-stack-reload-folding.ll b/test/CodeGen/X86/avx1-stack-reload-folding.ll
new file mode 100644
index 0000000..2e669b0
--- /dev/null
+++ b/test/CodeGen/X86/avx1-stack-reload-folding.ll
@@ -0,0 +1,68 @@
+; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests - we use the 'big vectors' pattern to guarantee spilling to stack.
+;
+; Many of these tests are primarily to check memory folding with specific instructions. Using a basic
+; load/cvt/store pattern to test for this would mean that it wouldn't be the memory folding code thats
+; being tested - the load-execute version of the instruction from the tables would be matched instead.
+
+define void @stack_fold_vmulpd(<64 x double>* %a, <64 x double>* %b, <64 x double>* %c) {
+  ;CHECK-LABEL: stack_fold_vmulpd
+  ;CHECK:       vmulpd {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+
+  %1 = load <64 x double>* %a
+  %2 = load <64 x double>* %b
+  %3 = fadd <64 x double> %1, %2
+  %4 = fsub <64 x double> %1, %2
+  %5 = fmul <64 x double> %3, %4
+  store <64 x double> %5, <64 x double>* %c
+  ret void
+}
+
+define void @stack_fold_cvtdq2ps(<128 x i32>* %a, <128 x i32>* %b, <128 x float>* %c) {
+  ;CHECK-LABEL: stack_fold_cvtdq2ps
+  ;CHECK:   vcvtdq2ps {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+
+  %1 = load <128 x i32>* %a
+  %2 = load <128 x i32>* %b
+  %3 = and <128 x i32> %1, %2
+  %4 = xor <128 x i32> %1, %2
+  %5 = sitofp <128 x i32> %3 to <128 x float>
+  %6 = sitofp <128 x i32> %4 to <128 x float>
+  %7 = fadd <128 x float> %5, %6
+  store <128 x float> %7, <128 x float>* %c
+  ret void
+}
+
+define void @stack_fold_cvttpd2dq(<64 x double>* %a, <64 x double>* %b, <64 x i32>* %c) #0 {
+  ;CHECK-LABEL: stack_fold_cvttpd2dq
+  ;CHECK:  vcvttpd2dqy {{[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+
+  %1 = load <64 x double>* %a
+  %2 = load <64 x double>* %b
+  %3 = fadd <64 x double> %1, %2
+  %4 = fsub <64 x double> %1, %2
+  %5 = fptosi <64 x double> %3 to <64 x i32>
+  %6 = fptosi <64 x double> %4 to <64 x i32>
+  %7 = or <64 x i32> %5, %6
+  store <64 x i32> %7, <64 x i32>* %c
+  ret void
+}
+
+define void @stack_fold_cvttps2dq(<128 x float>* %a, <128 x float>* %b, <128 x i32>* %c) #0 {
+  ;CHECK-LABEL: stack_fold_cvttps2dq
+  ;CHECK:   vcvttps2dq {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+
+  %1 = load <128 x float>* %a
+  %2 = load <128 x float>* %b
+  %3 = fadd <128 x float> %1, %2
+  %4 = fsub <128 x float> %1, %2
+  %5 = fptosi <128 x float> %3 to <128 x i32>
+  %6 = fptosi <128 x float> %4 to <128 x i32>
+  %7 = or <128 x i32> %5, %6
+  store <128 x i32> %7, <128 x i32>* %c
+  ret void
+}
diff --git a/test/CodeGen/X86/avx2-blend.ll b/test/CodeGen/X86/avx2-blend.ll
deleted file mode 100644
index b02442b..0000000
--- a/test/CodeGen/X86/avx2-blend.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
-
-define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
-; CHECK-LABEL: constant_pblendvb_avx2:
-; CHECK: vmovdqa
-; CHECK: vpblendvb
-  %1 = select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %xyzw, <32 x i8> %abcd
-  ret <32 x i8> %1
-}
-
-declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>)
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
new file mode 100644
index 0000000..ac2c73b
--- /dev/null
+++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=core-avx2 -mattr=avx2 | FileCheck %s
+
+define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
+  ; CHECK: vpblendw
+  %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind readnone
+
+
+define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: vpblendd
+  %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
+  ; CHECK: vpblendd
+  %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1]
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
+  ; CHECK: vmpsadbw
+  %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind readnone
+
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index ab3d591..84b22b7 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -158,21 +158,21 @@ define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) {
   ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
-  ; CHECK: vpslldq
-  %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
-  ; CHECK: vpslldq
-  %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
+
+
+define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
+  ; CHECK: vpslldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+  %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
+  ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
+  %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
 declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone
 
 
@@ -254,21 +254,21 @@ define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) {
   ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
-  ; CHECK: vpsrldq
-  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
-  ; CHECK: vpsrldq
-  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
+
+
+define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
+  ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
+  ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
+  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
 declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone
 
 
@@ -475,10 +475,10 @@ declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
 
 define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
   ; CHECK: vmpsadbw
-  %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1]
+  %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
-declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind readnone
+declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
@@ -499,10 +499,10 @@ declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounw
 
 define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
   ; CHECK: vpblendw
-  %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1]
+  %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i8 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
-declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind readnone
+declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone
 
 
 define <32 x i8> @test_x86_avx2_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
@@ -706,18 +706,18 @@ declare <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind re
 
 define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
   ; CHECK: vpblendd
-  %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1]
+  %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
-declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind readnone
+declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
   ; CHECK: vpblendd
-  %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1]
+  %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
-declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind readnone
+declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
 
 
 define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) {
diff --git a/test/CodeGen/X86/avx2-palignr.ll b/test/CodeGen/X86/avx2-palignr.ll
deleted file mode 100644
index 83573dc..0000000
--- a/test/CodeGen/X86/avx2-palignr.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
-
-define <8 x i32> @test1(<8 x i32> %A, <8 x i32> %B) nounwind {
-; CHECK-LABEL: test1:
-; CHECK: vpalignr $4
-  %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7, i32 12>
-  ret <8 x i32> %C
-}
-
-define <8 x i32> @test2(<8 x i32> %A, <8 x i32> %B) nounwind {
-; CHECK-LABEL: test2:
-; CHECK: vpalignr $4
-  %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 undef, i32 12>
-  ret <8 x i32> %C
-}
-
-define <8 x i32> @test3(<8 x i32> %A, <8 x i32> %B) nounwind {
-; CHECK-LABEL: test3:
-; CHECK: vpalignr $4
-  %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 1, i32 undef, i32 3, i32 8, i32 5, i32 6, i32 7, i32 12>
-  ret <8 x i32> %C
-}
-;
-define <8 x i32> @test4(<8 x i32> %A, <8 x i32> %B) nounwind {
-; CHECK-LABEL: test4:
-; CHECK: vpalignr $8
-  %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 10, i32 11, i32 undef, i32 1, i32 14, i32 15, i32 4, i32 5>
-  ret <8 x i32> %C
-}
-
-define <16 x i16> @test5(<16 x i16> %A, <16 x i16> %B) nounwind {
-; CHECK-LABEL: test5:
-; CHECK: vpalignr $6
-  %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> <i32 3, i32 4, i32 undef, i32 6, i32 7, i32 16, i32 17, i32 18, i32 11, i32 12, i32 13, i32 undef, i32 15, i32 24, i32 25, i32 26>
-  ret <16 x i16> %C
-}
-
-define <16 x i16> @test6(<16 x i16> %A, <16 x i16> %B) nounwind {
-; CHECK-LABEL: test6:
-; CHECK: vpalignr $6
-  %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 11, i32 12, i32 13, i32 undef, i32 15, i32 24, i32 25, i32 26>
-  ret <16 x i16> %C
-}
-
-define <16 x i16> @test7(<16 x i16> %A, <16 x i16> %B) nounwind {
-; CHECK-LABEL: test7:
-; CHECK: vpalignr $6
-  %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  ret <16 x i16> %C
-}
-
-define <32 x i8> @test8(<32 x i8> %A, <32 x i8> %B) nounwind {
-; CHECK-LABEL: test8:
-; CHECK: vpalignr $5
-  %C = shufflevector <32 x i8> %A, <32 x i8> %B, <32 x i32> <i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52>
-  ret <32 x i8> %C
-}
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
deleted file mode 100644
index 185b989..0000000
--- a/test/CodeGen/X86/avx2-shuffle.ll
+++ /dev/null
@@ -1,127 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
-
-; Make sure that we don't match this shuffle using the vpblendw YMM instruction.
-; The mask for the vpblendw instruction needs to be identical for both halves
-; of the YMM. Need to use two vpblendw instructions.
-
-; CHECK: vpblendw_test1
-; mask = 10010110,b = 150,d
-; CHECK: vpblendw  $150, %ymm
-; CHECK: ret
-define <16 x i16> @vpblendw_test1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
-  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3,  i32 20, i32 5,  i32 6,  i32 23, 
-                                                               i32 8, i32 25, i32 26, i32 11, i32 28, i32 13, i32 14, i32 31>
-  ret <16 x i16> %t
-}
-
-; CHECK: vpblendw_test2
-; mask1 = 00010110 = 22
-; mask2 = 10000000 = 128
-; CHECK: vpblendw  $128, %xmm
-; CHECK: vpblendw  $22, %xmm
-; CHECK: vinserti128
-; CHECK: ret
-define <16 x i16> @vpblendw_test2(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
-  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 7, 
-                                                               i32 8, i32 9,  i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
-  ret <16 x i16> %t
-}
-
-; CHECK: blend_test1
-; CHECK: vpblendd
-; CHECK: ret
-define <8 x i32> @blend_test1(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline {
-  %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
-  ret <8 x i32> %t
-}
-
-; CHECK: blend_test2
-; CHECK: vpblendd
-; CHECK: ret
-define <8 x i32> @blend_test2(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline {
-  %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
-  ret <8 x i32> %t
-}
-
-
-; CHECK: blend_test3
-; CHECK: vblendps
-; CHECK: ret
-define <8 x float> @blend_test3(<8 x float> %a, <8 x float> %b) nounwind alwaysinline {
-  %t = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
-  ret <8 x float> %t
-}
-
-; CHECK: blend_test4
-; CHECK: vblendpd
-; CHECK: ret
-define <4 x i64> @blend_test4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-  ret <4 x i64> %t
-}
-
-;; 2 tests for shufflevectors that optimize to blend + immediate
-; CHECK-LABEL: @blend_test5
-; CHECK: vpblendd $10, %xmm1, %xmm0, %xmm0
-; CHECK: ret
-define <4 x i32> @blend_test5(<4 x i32> %a, <4 x i32> %b) {
-  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x i32> %1
-}
-
-; CHECK-LABEL: @blend_test6
-; CHECK: vpblendw $134, %ymm1, %ymm0, %ymm0
-; CHECK: ret
-define <16 x i16> @blend_test6(<16 x i16> %a, <16 x i16> %b) {
-  %1 = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32  3, i32  4, i32  5, i32  6, i32 23,
-                                                               i32 8, i32 25, i32 26, i32 11, i32 12, i32 13, i32 14, i32 31>
-  ret <16 x i16> %1
-}
-
-; CHECK: vpshufhw $27, %ymm
-define <16 x i16> @vpshufhw(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
-  ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpshuflw $27, %ymm
-define <16 x i16> @vpshuflw(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
-  ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpshufb_test
-; CHECK: vpshufb {{.*\(%r.*}}, %ymm
-; CHECK: ret
-define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind {
-  %S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, 
-                                                                i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,  
-                                                                i32 18, i32 19, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, 
-                                                                i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
-  ret <32 x i8>%S
-}
-
-; CHECK: vpshufb1_test
-; CHECK: vpshufb {{.*\(%r.*}}, %ymm
-; CHECK: ret
-define <32 x i8> @vpshufb1_test(<32 x i8> %a) nounwind {
-  %S = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, 
-                                                                i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,  
-                                                                i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, 
-                                                                i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
-  ret <32 x i8>%S
-}
-
-
-; CHECK: vpshufb2_test
-; CHECK: vpshufb {{.*\(%r.*}}, %ymm
-; CHECK: ret
-define <32 x i8> @vpshufb2_test(<32 x i8> %a) nounwind {
-  %S = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, 
-                                                                i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,  
-                                                                i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, 
-                                                                i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
-  ret <32 x i8>%S
-}
diff --git a/test/CodeGen/X86/avx2-unpack.ll b/test/CodeGen/X86/avx2-unpack.ll
deleted file mode 100644
index 6d17443..0000000
--- a/test/CodeGen/X86/avx2-unpack.ll
+++ /dev/null
@@ -1,86 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
-
-; CHECK: vpunpckhdq
-define <8 x i32> @unpackhidq1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  ret <8 x i32> %shuffle.i
-}
-
-; CHECK: vpunpckhqdq
-define <4 x i64> @unpackhiqdq1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  ret <4 x i64> %shuffle.i
-}
-
-; CHECK: vpunpckldq
-define <8 x i32> @unpacklodq1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  ret <8 x i32> %shuffle.i
-}
-
-; CHECK: vpunpcklqdq
-define <4 x i64> @unpacklqdq1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  ret <4 x i64> %shuffle.i
-}
-
-; CHECK: vpunpckhwd
-define <16 x i16> @unpackhwd(<16 x i16> %src1, <16 x i16> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src2, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpunpcklwd
-define <16 x i16> @unpacklwd(<16 x i16> %src1, <16 x i16> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
-  ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpunpckhbw
-define <32 x i8> @unpackhbw(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src2, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
-  ret <32 x i8> %shuffle.i
-}
-
-; CHECK: vpunpcklbw
-define <32 x i8> @unpacklbw(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
-  ret <32 x i8> %shuffle.i
-}
-
-; CHECK: vpunpckhdq
-define <8 x i32> @unpackhidq1_undef(<8 x i32> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  ret <8 x i32> %shuffle.i
-}
-
-; CHECK: vpunpckhqdq
-define <4 x i64> @unpackhiqdq1_undef(<4 x i64> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  ret <4 x i64> %shuffle.i
-}
-
-; CHECK: vpunpckhwd
-define <16 x i16> @unpackhwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpunpcklwd
-define <16 x i16> @unpacklwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
-  ret <16 x i16> %shuffle.i
-}
-
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 66f586d..924c06e 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -317,7 +317,7 @@ define   <4 x double> @_inreg4xdouble(<4 x double> %a) {
 }
 
 ;CHECK-LABEL: _inreg2xdouble:
-;CHECK: vpbroadcastq
+;CHECK: vunpcklpd
 ;CHECK: ret
 define   <2 x double> @_inreg2xdouble(<2 x double> %a) {
   %b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer
diff --git a/test/CodeGen/X86/avx2-vperm2i128.ll b/test/CodeGen/X86/avx2-vperm2i128.ll
deleted file mode 100644
index 1937db5..0000000
--- a/test/CodeGen/X86/avx2-vperm2i128.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
-
-; CHECK: vperm2i128 $17
-define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
-entry:
-  ; add forces execution domain
-  %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-  %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  ret <32 x i8> %shuffle
-}
-
-; CHECK: vperm2i128 $3
-define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  ; add forces execution domain
-  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
-  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: vperm2i128 $49
-define <8 x i32> @E3(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
-entry:
-  ; add forces execution domain
-  %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
-  ret <8 x i32> %shuffle
-}
-
-; CHECK: vperm2i128 $2
-define <16 x i16> @E4(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
-entry:
-  ; add forces execution domain
-  %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
-  %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ret <16 x i16> %shuffle
-}
-
-; CHECK: vperm2i128 $2, (%
-define <16 x i16> @E5(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
-entry:
-  %c = load <16 x i16>* %a
-  %d = load <16 x i16>* %b
-  %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
-  %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ret <16 x i16> %shuffle
-}
diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
index 4d1c9f7..c43da9c 100644
--- a/test/CodeGen/X86/avx512-arith.ll
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -1,189 +1,217 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
-; CHECK-LABEL: addpd512
-; CHECK: vaddpd
-; CHECK: ret
 define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) {
+; CHECK-LABEL: addpd512:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %add.i = fadd <8 x double> %x, %y
   ret <8 x double> %add.i
 }
 
-; CHECK-LABEL: addpd512fold
-; CHECK: vaddpd LCP{{.*}}(%rip)
-; CHECK: ret
 define <8 x double> @addpd512fold(<8 x double> %y) {
+; CHECK-LABEL: addpd512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %add.i = fadd <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.800000e+00, double 2.300000e+00, double 1.200000e+00>
   ret <8 x double> %add.i
 }
 
-; CHECK-LABEL: addps512
-; CHECK: vaddps
-; CHECK: ret
 define <16 x float> @addps512(<16 x float> %y, <16 x float> %x) {
+; CHECK-LABEL: addps512:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %add.i = fadd <16 x float> %x, %y
   ret <16 x float> %add.i
 }
 
-; CHECK-LABEL: addps512fold
-; CHECK: vaddps LCP{{.*}}(%rip)
-; CHECK: ret
 define <16 x float> @addps512fold(<16 x float> %y) {
+; CHECK-LABEL: addps512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %add.i = fadd <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 4.500000e+00, float 4.500000e+00, float 0x400B333340000000,  float 0x4002666660000000, float 0x3FF3333340000000>
   ret <16 x float> %add.i
 }
 
-; CHECK-LABEL: subpd512
-; CHECK: vsubpd
-; CHECK: ret
 define <8 x double> @subpd512(<8 x double> %y, <8 x double> %x) {
+; CHECK-LABEL: subpd512:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsubpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %sub.i = fsub <8 x double> %x, %y
   ret <8 x double> %sub.i
 }
 
-; CHECK-LABEL: @subpd512fold
-; CHECK: vsubpd (%
-; CHECK: ret
 define <8 x double> @subpd512fold(<8 x double> %y, <8 x double>* %x) {
+; CHECK-LABEL: subpd512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsubpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %tmp2 = load <8 x double>* %x, align 8
   %sub.i = fsub <8 x double> %y, %tmp2
   ret <8 x double> %sub.i
 }
 
-; CHECK-LABEL: @subps512
-; CHECK: vsubps
-; CHECK: ret
 define <16 x float> @subps512(<16 x float> %y, <16 x float> %x) {
+; CHECK-LABEL: subps512:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsubps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %sub.i = fsub <16 x float> %x, %y
   ret <16 x float> %sub.i
 }
 
-; CHECK-LABEL: subps512fold
-; CHECK: vsubps (%
-; CHECK: ret
 define <16 x float> @subps512fold(<16 x float> %y, <16 x float>* %x) {
+; CHECK-LABEL: subps512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsubps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %tmp2 = load <16 x float>* %x, align 4
   %sub.i = fsub <16 x float> %y, %tmp2
   ret <16 x float> %sub.i
 }
 
-; CHECK-LABEL: imulq512
-; CHECK: vpmuludq
-; CHECK: vpmuludq
-; CHECK: ret
 define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
+; CHECK-LABEL: imulq512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmuludq %zmm0, %zmm1, %zmm2
+; CHECK-NEXT:    vpsrlq $32, %zmm0, %zmm3
+; CHECK-NEXT:    vpmuludq %zmm3, %zmm1, %zmm3
+; CHECK-NEXT:    vpsllq $32, %zmm3, %zmm3
+; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; CHECK-NEXT:    vpsrlq $32, %zmm1, %zmm1
+; CHECK-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vpsllq $32, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %z = mul <8 x i64>%x, %y
   ret <8 x i64>%z
 }
 
-; CHECK-LABEL: mulpd512
-; CHECK: vmulpd
-; CHECK: ret
 define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) {
+; CHECK-LABEL: mulpd512:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %mul.i = fmul <8 x double> %x, %y
   ret <8 x double> %mul.i
 }
 
-; CHECK-LABEL: mulpd512fold
-; CHECK: vmulpd LCP{{.*}}(%rip)
-; CHECK: ret
 define <8 x double> @mulpd512fold(<8 x double> %y) {
+; CHECK-LABEL: mulpd512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vmulpd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %mul.i = fmul <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
   ret <8 x double> %mul.i
 }
 
-; CHECK-LABEL: mulps512
-; CHECK: vmulps
-; CHECK: ret
 define <16 x float> @mulps512(<16 x float> %y, <16 x float> %x) {
+; CHECK-LABEL: mulps512:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vmulps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %mul.i = fmul <16 x float> %x, %y
   ret <16 x float> %mul.i
 }
 
-; CHECK-LABEL: mulps512fold
-; CHECK: vmulps LCP{{.*}}(%rip)
-; CHECK: ret
 define <16 x float> @mulps512fold(<16 x float> %y) {
+; CHECK-LABEL: mulps512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %mul.i = fmul <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
   ret <16 x float> %mul.i
 }
 
-; CHECK-LABEL: divpd512
-; CHECK: vdivpd
-; CHECK: ret
 define <8 x double> @divpd512(<8 x double> %y, <8 x double> %x) {
+; CHECK-LABEL: divpd512:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %div.i = fdiv <8 x double> %x, %y
   ret <8 x double> %div.i
 }
 
-; CHECK-LABEL: divpd512fold
-; CHECK: vdivpd LCP{{.*}}(%rip)
-; CHECK: ret
 define <8 x double> @divpd512fold(<8 x double> %y) {
+; CHECK-LABEL: divpd512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vdivpd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %div.i = fdiv <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
   ret <8 x double> %div.i
 }
 
-; CHECK-LABEL: divps512
-; CHECK: vdivps
-; CHECK: ret
 define <16 x float> @divps512(<16 x float> %y, <16 x float> %x) {
+; CHECK-LABEL: divps512:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vdivps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %div.i = fdiv <16 x float> %x, %y
   ret <16 x float> %div.i
 }
 
-; CHECK-LABEL: divps512fold
-; CHECK: vdivps LCP{{.*}}(%rip)
-; CHECK: ret
 define <16 x float> @divps512fold(<16 x float> %y) {
+; CHECK-LABEL: divps512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vdivps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %div.i = fdiv <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000>
   ret <16 x float> %div.i
 }
 
-; CHECK-LABEL: vpaddq_test
-; CHECK: vpaddq %zmm
-; CHECK: ret
 define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
+; CHECK-LABEL: vpaddq_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %x = add <8 x i64> %i, %j
   ret <8 x i64> %x
 }
 
-; CHECK-LABEL: vpaddq_fold_test
-; CHECK: vpaddq (%
-; CHECK: ret
 define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, <8 x i64>* %j) nounwind {
+; CHECK-LABEL: vpaddq_fold_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %tmp = load <8 x i64>* %j, align 4
   %x = add <8 x i64> %i, %tmp
   ret <8 x i64> %x
 }
 
-; CHECK-LABEL: vpaddq_broadcast_test
-; CHECK: vpaddq LCP{{.*}}(%rip){1to8}
-; CHECK: ret
 define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind {
+; CHECK-LABEL: vpaddq_broadcast_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %x = add <8 x i64> %i, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
   ret <8 x i64> %x
 }
 
-; CHECK-LABEL: vpaddq_broadcast2_test
-; CHECK: vpaddq (%rdi){1to8}
-; CHECK: ret
 define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind {
+; CHECK-LABEL: vpaddq_broadcast2_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %tmp = load i64* %j
   %j.0 = insertelement <8 x i64> undef, i64 %tmp, i32 0
   %j.1 = insertelement <8 x i64> %j.0, i64 %tmp, i32 1
@@ -197,55 +225,67 @@ define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind {
   ret <8 x i64> %x
 }
 
-; CHECK-LABEL: vpaddd_test
-; CHECK: vpaddd %zmm
-; CHECK: ret
 define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
+; CHECK-LABEL: vpaddd_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %x = add <16 x i32> %i, %j
   ret <16 x i32> %x
 }
 
-; CHECK-LABEL: vpaddd_fold_test
-; CHECK: vpaddd (%
-; CHECK: ret
 define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, <16 x i32>* %j) nounwind {
+; CHECK-LABEL: vpaddd_fold_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %tmp = load <16 x i32>* %j, align 4
   %x = add <16 x i32> %i, %tmp
   ret <16 x i32> %x
 }
 
-; CHECK-LABEL: vpaddd_broadcast_test
-; CHECK: vpaddd LCP{{.*}}(%rip){1to16}
-; CHECK: ret
 define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind {
+; CHECK-LABEL: vpaddd_broadcast_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   ret <16 x i32> %x
 }
 
-; CHECK-LABEL: vpaddd_mask_test
-; CHECK: vpaddd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }}
-; CHECK: ret
 define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_mask_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; CHECK-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
   %x = add <16 x i32> %i, %j
   %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
   ret <16 x i32> %r
 }
 
-; CHECK-LABEL: vpaddd_maskz_test
-; CHECK: vpaddd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} {z} }}
-; CHECK: ret
 define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_maskz_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; CHECK-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
   %x = add <16 x i32> %i, %j
   %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
   ret <16 x i32> %r
 }
 
-; CHECK-LABEL: vpaddd_mask_fold_test
-; CHECK: vpaddd (%rdi), {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }}
-; CHECK: ret
 define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_mask_fold_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
   %j = load <16 x i32>* %j.ptr
   %x = add <16 x i32> %i, %j
@@ -253,20 +293,26 @@ define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16
   ret <16 x i32> %r
 }
 
-; CHECK-LABEL: vpaddd_mask_broadcast_test
-; CHECK: vpaddd LCP{{.*}}(%rip){1to16}, {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }}
-; CHECK: ret
 define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_mask_broadcast_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
   %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
   ret <16 x i32> %r
 }
 
-; CHECK-LABEL: vpaddd_maskz_fold_test
-; CHECK: vpaddd (%rdi), {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} {z}
-; CHECK: ret
 define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_maskz_fold_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
   %j = load <16 x i32>* %j.ptr
   %x = add <16 x i32> %i, %j
@@ -274,125 +320,141 @@ define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16
   ret <16 x i32> %r
 }
 
-; CHECK-LABEL: vpaddd_maskz_broadcast_test
-; CHECK: vpaddd LCP{{.*}}(%rip){1to16}, {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} {z}
-; CHECK: ret
 define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_maskz_broadcast_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
   %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
   ret <16 x i32> %r
 }
 
-; CHECK-LABEL: vpsubq_test
-; CHECK: vpsubq %zmm
-; CHECK: ret
 define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
+; CHECK-LABEL: vpsubq_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %x = sub <8 x i64> %i, %j
   ret <8 x i64> %x
 }
 
-; CHECK-LABEL: vpsubd_test
-; CHECK: vpsubd
-; CHECK: ret
 define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
+; CHECK-LABEL: vpsubd_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %x = sub <16 x i32> %i, %j
   ret <16 x i32> %x
 }
 
-; CHECK-LABEL: vpmulld_test
-; CHECK: vpmulld %zmm
-; CHECK: ret
 define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) {
+; CHECK-LABEL: vpmulld_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %x = mul <16 x i32> %i, %j
   ret <16 x i32> %x
 }
 
-; CHECK-LABEL: sqrtA
-; CHECK: vsqrtss {{.*}} encoding: [0x62
-; CHECK: ret
 declare float @sqrtf(float) readnone
 define float @sqrtA(float %a) nounwind uwtable readnone ssp {
+; CHECK-LABEL: sqrtA:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
 entry:
   %conv1 = tail call float @sqrtf(float %a) nounwind readnone
   ret float %conv1
 }
 
-; CHECK-LABEL: sqrtB
-; CHECK: vsqrtsd {{.*}}## encoding: [0x62
-; CHECK: ret
 declare double @sqrt(double) readnone
 define double @sqrtB(double %a) nounwind uwtable readnone ssp {
+; CHECK-LABEL: sqrtB:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
 entry:
   %call = tail call double @sqrt(double %a) nounwind readnone
   ret double %call
 }
 
-; CHECK-LABEL: sqrtC
-; CHECK: vsqrtss {{.*}}## encoding: [0x62
-; CHECK: ret
 declare float @llvm.sqrt.f32(float)
 define float @sqrtC(float %a) nounwind {
+; CHECK-LABEL: sqrtC:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %b = call float @llvm.sqrt.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: sqrtD
-; CHECK: vsqrtps {{.*}}
-; CHECK: ret
 declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
 define <16 x float> @sqrtD(<16 x float> %a) nounwind {
+; CHECK-LABEL: sqrtD:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
   ret <16 x float> %b
 }
 
-; CHECK-LABEL: sqrtE
-; CHECK: vsqrtpd {{.*}}
-; CHECK: ret
 declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
 define <8 x double> @sqrtE(<8 x double> %a) nounwind {
+; CHECK-LABEL: sqrtE:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
   ret <8 x double> %b
 }
 
-; CHECK-LABEL: fadd_broadcast
-; CHECK: LCP{{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK: ret
 define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind {
+; CHECK-LABEL: fadd_broadcast:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = fadd <16 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
   ret <16 x float> %b
 }
 
-; CHECK-LABEL: addq_broadcast
-; CHECK: vpaddq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK: ret
 define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind {
+; CHECK-LABEL: addq_broadcast:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = add <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
   ret <8 x i64> %b
 }
 
-; CHECK-LABEL: orq_broadcast
-; CHECK: vporq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK: ret
 define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
+; CHECK-LABEL: orq_broadcast:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = or <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
   ret <8 x i64> %b
 }
 
-; CHECK-LABEL: andd512fold
-; CHECK: vpandd (%
-; CHECK: ret
 define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
+; CHECK-LABEL: andd512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vpandd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %a = load <16 x i32>* %x, align 4
   %b = and <16 x i32> %y, %a
   ret <16 x i32> %b
 }
 
-; CHECK-LABEL: andqbrst
-; CHECK: vpandq  (%rdi){1to8}, %zmm
-; CHECK: ret
 define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) {
+; CHECK-LABEL: andqbrst:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vpandq (%rdi){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %a = load i64* %ap, align 8
   %b = insertelement <8 x i64> undef, i64 %a, i32 0
diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll
index b5a2aa8..9e9ad31 100644
--- a/test/CodeGen/X86/avx512-build-vector.ll
+++ b/test/CodeGen/X86/avx512-build-vector.ll
@@ -1,30 +1,43 @@
 ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
-; CHECK-LABEL: test1
-; CHECK: vpxord
-; CHECK: ret
 define <16 x i32> @test1(i32* %x) {
+; CHECK-LABEL: test1:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovd (%rdi), %xmm0
+; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
    %y = load i32* %x, align 4
    %res = insertelement <16 x i32>zeroinitializer, i32 %y, i32 4
    ret <16 x i32>%res
 }
 
-; CHECK-LABEL: test2
-; CHECK: vpaddd LCP{{.*}}(%rip){1to16}
-; CHECK: ret
 define <16 x i32> @test2(<16 x i32> %x) {
+; CHECK-LABEL: test2:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
    %res = add <16 x i32><i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %x
    ret <16 x i32>%res
 }
 
-; CHECK-LABEL: test3
-; CHECK: vinsertf128
-; CHECK: vinsertf64x4
-; CHECK: ret
 define <16 x float> @test3(<4 x float> %a) {
+; CHECK-LABEL: test3:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vmovss %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vmovss %xmm1, %xmm2, %xmm1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[1,0],xmm0[0,1]
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; CHECK-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = extractelement <4 x float> %a, i32 2
   %c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %b, i32 5
   %b1 = extractelement <4 x float> %a, i32 0
   %c1 = insertelement <16 x float> %c, float %b1, i32 6
   ret <16 x float>%c1
-}
-\ No newline at end of file
+}
diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll
index 47e50a9..6e0d185 100644
--- a/test/CodeGen/X86/avx512-cmp.ll
+++ b/test/CodeGen/X86/avx512-cmp.ll
@@ -28,10 +28,9 @@ l2:
   ret float %c1
 }
 
+; FIXME: Can use vcmpeqss and extract from the mask here in AVX512.
 ; CHECK-LABEL: test3
-; CHECK: vcmpeqss
-; CHECK: kmov
-; CHECK: ret
+; CHECK: vucomiss {{.*}}encoding: [0x62
 define i32 @test3(float %a, float %b) {
 
   %cmp10.i = fcmp oeq float %a, %b
@@ -86,3 +85,17 @@ define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
   %res = select i1 %tmp5, i32 1, i32 %a3
   ret i32 %res
 }
+
+; CHECK-LABEL: test9
+; CHECK: testb
+; CHECK-NOT: kmov
+; CHECK: ret
+define i32 @test9(i64 %a) {
+ %b = and i64 %a, 1
+ %cmp10.i = icmp eq i64 %b, 0
+ br i1 %cmp10.i, label %A, label %B
+A:
+ ret i32 6
+B:
+ ret i32 7
+}
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index f5cda96..2b672a7 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -255,3 +255,56 @@ define double @uitofp03(i32 %a) nounwind {
   %b = uitofp i32 %a to double
   ret double %b
 }
+
+; CHECK-LABEL: @sitofp_16i1_float
+; CHECK: vpbroadcastd
+; CHECK: vcvtdq2ps
+define <16 x float> @sitofp_16i1_float(<16 x i32> %a) {
+  %mask = icmp slt <16 x i32> %a, zeroinitializer
+  %1 = sitofp <16 x i1> %mask to <16 x float>
+  ret <16 x float> %1
+}
+
+; CHECK-LABEL: @sitofp_16i8_float
+; CHECK: vpmovsxbd
+; CHECK: vcvtdq2ps
+define <16 x float> @sitofp_16i8_float(<16 x i8> %a) {
+  %1 = sitofp <16 x i8> %a to <16 x float>
+  ret <16 x float> %1
+}
+
+; CHECK-LABEL: @sitofp_16i16_float
+; CHECK: vpmovsxwd
+; CHECK: vcvtdq2ps
+define <16 x float> @sitofp_16i16_float(<16 x i16> %a) {
+  %1 = sitofp <16 x i16> %a to <16 x float>
+  ret <16 x float> %1
+}
+
+; CHECK-LABEL: @sitofp_8i16_double
+; CHECK: vpmovsxwd
+; CHECK: vcvtdq2pd
+define <8 x double> @sitofp_8i16_double(<8 x i16> %a) {
+  %1 = sitofp <8 x i16> %a to <8 x double>
+  ret <8 x double> %1
+}
+
+; CHECK-LABEL: sitofp_8i8_double
+; CHECK: vpmovzxwd
+; CHECK: vpslld
+; CHECK: vpsrad
+; CHECK: vcvtdq2pd
+define <8 x double> @sitofp_8i8_double(<8 x i8> %a) {
+  %1 = sitofp <8 x i8> %a to <8 x double>
+  ret <8 x double> %1
+}
+
+
+; CHECK-LABEL: @sitofp_8i1_double
+; CHECK: vpbroadcastq
+; CHECK: vcvtdq2pd
+define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
+  %cmpres = fcmp ogt <8 x double> %a, zeroinitializer
+  %1 = sitofp <8 x i1> %cmpres to <8 x double>
+  ret <8 x double> %1
+}
diff --git a/test/CodeGen/X86/avx512-fma-intrinsics.ll b/test/CodeGen/X86/avx512-fma-intrinsics.ll
index ce3d759..366d324 100644
--- a/test/CodeGen/X86/avx512-fma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-fma-intrinsics.ll
@@ -1,97 +1,113 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
 
 define <16 x float> @test_x86_vfmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfmadd_ps_z
   ; CHECK: vfmadd213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.fma.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <8 x double> @test_x86_vfmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfmadd_pd_z
   ; CHECK: vfmadd213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.fma.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+
+define <8 x double> @test_mask_fmadd_pd(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+; CHECK-LABEL: test_mask_fmadd_pd:
+; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa8,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 4)
+  ret <8 x double> %res
+}
+
+declare <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
 
 define <16 x float> @test_x86_vfmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfmsubps_z
   ; CHECK: vfmsub213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.fma.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <8 x double> @test_x86_vfmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfmsubpd_z
   ; CHECK: vfmsub213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.fma.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfnmadd_ps_z
   ; CHECK: vfnmadd213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  %res = call <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.fma.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfnmadd_pd_z
   ; CHECK: vfnmadd213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  %res = call <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.fma.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfnmsubps_z
   ; CHECK: vfnmsub213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  %res = call <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.fma.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfnmsubpd_z
   ; CHECK: vfnmsub213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.fma.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfmaddsubps_z
   ; CHECK: vfmaddsub213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.fma.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+
+define <16 x float> @test_mask_fmaddsub_ps(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+; CHECK-LABEL: test_mask_fmaddsub_ps:
+; CHECK: vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa6,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4)
+  ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfmaddsubpd_z
   ; CHECK: vfmaddsub213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.fma.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <16 x float> @test_x86_vfmsubaddps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfmsubaddps_z
   ; CHECK: vfmsubadd213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.fma.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <8 x double> @test_x86_vfmsubaddpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfmsubaddpd_z
   ; CHECK: vfmsubadd213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.fma.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index b360c71..eba895e 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=KNL --check-prefix=CHECK %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX --check-prefix=CHECK %s
 
 ;CHECK-LABEL: test1:
 ;CHECK: vinsertps
@@ -12,9 +13,11 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
 }
 
 ;CHECK-LABEL: test2:
-;CHECK: vinsertf32x4
-;CHECK: vextractf32x4
-;CHECK: vinsertf32x4
+;KNL: vinsertf32x4 $0
+;SKX: vinsertf64x2 $0
+;CHECK: vextractf32x4 $3
+;KNL: vinsertf32x4 $3
+;SKX: vinsertf64x2 $3
 ;CHECK: ret
 define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
   %rrr = load double* %br
@@ -24,8 +27,8 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
 }
 
 ;CHECK-LABEL: test3:
-;CHECK: vextractf32x4
-;CHECK: vinsertf32x4
+;CHECK: vextractf32x4 $1
+;CHECK: vinsertf32x4 $0
 ;CHECK: ret
 define <16 x float> @test3(<16 x float> %x) nounwind {
   %eee = extractelement <16 x float> %x, i32 4
@@ -34,8 +37,9 @@ define <16 x float> @test3(<16 x float> %x) nounwind {
 }
 
 ;CHECK-LABEL: test4:
-;CHECK: vextracti32x4
-;CHECK: vinserti32x4
+;CHECK: vextracti32x4 $2
+;KNL: vinserti32x4 $0
+;SKX: vinserti64x2 $0
 ;CHECK: ret
 define <8 x i64> @test4(<8 x i64> %x) nounwind {
   %eee = extractelement <8 x i64> %x, i32 4
@@ -186,12 +190,13 @@ define i16 @test16(i1 *%addr, i16 %a) {
 ;CHECK-LABEL: test17
 ;CHECK: kshiftlw
 ;CHECK: kshiftrw
-;CHECK: korw
+;KNL: korw
+;SKX: korb
 ;CHECK: ret
 define i8 @test17(i1 *%addr, i8 %a) {
   %x = load i1 * %addr, align 128
   %a1 = bitcast i8 %a to <8 x i1>
-  %x1 = insertelement <8 x i1> %a1, i1 %x, i32 10
+  %x1 = insertelement <8 x i1> %a1, i1 %x, i32 4
   %x2 = bitcast <8 x i1>%x1 to i8
   ret i8 %x2
 }
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 18cfcfe..691d1fb 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -60,20 +60,6 @@ define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
 }
 declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone
 
-define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
-  ; CHECK: vrcp28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
-  %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1]
-  ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
-  ; CHECK: vrcp28pd {sae}, {{.*}}encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
-  %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) ; <<8 x double>> [#uses=1]
-  ret <8 x double> %res
-}
-declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
-
 declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
 
 define <8 x double> @test7(<8 x double> %a) {
@@ -97,13 +83,6 @@ define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
 }
 declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
 
-define <16 x float> @test_rsqrt28_ps_512(<16 x float> %a0) {
-  ; CHECK: vrsqrt28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
-  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1]
-  ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
 define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
   ; CHECK: vrsqrt14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4f,0xc0]
   %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
@@ -111,13 +90,6 @@ define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
 }
 declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
 
-define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
-  ; CHECK: vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
-  %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
 define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
   ; CHECK: vrcp14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4d,0xc0]
   %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
@@ -125,26 +97,19 @@ define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
 }
 declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
 
-define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
-  ; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
-  %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
 define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
   ; CHECK: vsqrtpd
-  %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1]
+  %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0,  <8 x double> zeroinitializer, i8 -1, i32 4) ; <<8 x double>> [#uses=1]
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
   ; CHECK: vsqrtps
-  %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+  %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) ; <<16 x float>> [#uses=1]
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1) {
   ; CHECK: vsqrtss {{.*}}encoding: [0x62
@@ -611,3 +576,515 @@ define <8 x i64> @test_vmovntdqa(i8 *%x) {
 }
 
 declare <8 x i64> @llvm.x86.avx512.movntdqa(i8*)
+
+define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_valign_q:
+; CHECK: valignq $2, %zmm1, %zmm0, %zmm0
+  %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i8 2, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) {
+; CHECK-LABEL: test_mask_valign_q:
+; CHECK: valignq $2, %zmm1, %zmm0, %zmm2 {%k1}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i8 2, <8 x i64> %src, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i8, <8 x i64>, i8)
+
+define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_maskz_valign_d:
+; CHECK: valignd $5, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x03,0xc1,0x05]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32> %a, <16 x i32> %b, i8 5, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32>, <16 x i32>, i8, <16 x i32>, i16)
+
+define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) {
+ ; CHECK-LABEL: test_mask_store_ss
+ ; CHECK: vmovss %xmm0, (%rdi) {%k1}     ## encoding: [0x62,0xf1,0x7e,0x09,0x11,0x07]
+ call void @llvm.x86.avx512.mask.store.ss(i8* %ptr, <4 x float> %data, i8 %mask)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.ss(i8*, <4 x float>, i8 )
+
+define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_pcmpeq_d
+; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 ##
+  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
+  ret i16 %res
+}
+
+define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_d
+; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ##
+  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
+  ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)
+
+define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_pcmpeq_q
+; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
+  ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_q
+; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
+  ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)
+
+define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_pcmpgt_d
+; CHECK: vpcmpgtd %zmm1, %zmm0, %k0 ##
+  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
+  ret i16 %res
+}
+
+define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_d
+; CHECK: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ##
+  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
+  ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)
+
+define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_pcmpgt_q
+; CHECK: vpcmpgtq %zmm1, %zmm0, %k0 ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
+  ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_q
+; CHECK: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
+  ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)
+
+define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK_LABEL: test_cmp_d_512
+; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 ##
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltd %zmm1, %zmm0, %k0 ##
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpled %zmm1, %zmm0, %k0 ##
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordd %zmm1, %zmm0, %k0 ##
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqd %zmm1, %zmm0, %k0 ##
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltd %zmm1, %zmm0, %k0 ##
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnled %zmm1, %zmm0, %k0 ##
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordd %zmm1, %zmm0, %k0 ##
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+; CHECK_LABEL: test_mask_cmp_d_512
+; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ##
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltd %zmm1, %zmm0, %k0 {%k1} ##
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpled %zmm1, %zmm0, %k0 {%k1} ##
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordd %zmm1, %zmm0, %k0 {%k1} ##
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ##
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ##
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnled %zmm1, %zmm0, %k0 {%k1} ##
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordd %zmm1, %zmm0, %k0 {%k1} ##
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
+
+define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK_LABEL: test_ucmp_d_512
+; CHECK: vpcmpequd %zmm1, %zmm0, %k0 ##
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltud %zmm1, %zmm0, %k0 ##
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleud %zmm1, %zmm0, %k0 ##
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordud %zmm1, %zmm0, %k0 ##
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequd %zmm1, %zmm0, %k0 ##
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltud %zmm1, %zmm0, %k0 ##
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleud %zmm1, %zmm0, %k0 ##
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordud %zmm1, %zmm0, %k0 ##
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+; CHECK_LABEL: test_mask_ucmp_d_512
+; CHECK: vpcmpequd %zmm1, %zmm0, %k0 {%k1} ##
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ##
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ##
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordud %zmm1, %zmm0, %k0 {%k1} ##
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequd %zmm1, %zmm0, %k0 {%k1} ##
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ##
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ##
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordud %zmm1, %zmm0, %k0 {%k1} ##
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
+
+define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK_LABEL: test_cmp_q_512
+; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %zmm1, %zmm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %zmm1, %zmm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %zmm1, %zmm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %zmm1, %zmm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %zmm1, %zmm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %zmm1, %zmm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %zmm1, %zmm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+; CHECK_LABEL: test_mask_cmp_q_512
+; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %zmm1, %zmm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %zmm1, %zmm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %zmm1, %zmm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %zmm1, %zmm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %zmm1, %zmm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %zmm1, %zmm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %zmm1, %zmm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK_LABEL: test_ucmp_q_512
+; CHECK: vpcmpequq %zmm1, %zmm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %zmm1, %zmm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %zmm1, %zmm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %zmm1, %zmm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %zmm1, %zmm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %zmm1, %zmm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %zmm1, %zmm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %zmm1, %zmm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+; CHECK_LABEL: test_mask_ucmp_q_512
+; CHECK: vpcmpequq %zmm1, %zmm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %zmm1, %zmm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %zmm1, %zmm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %zmm1, %zmm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %zmm1, %zmm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %zmm1, %zmm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %zmm1, %zmm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
+
+define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
+; CHECK-LABEL: test_mask_vextractf32x4:
+; CHECK: vextractf32x4 $2, %zmm1, %xmm0 {%k1}
+  %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i8 2, <4 x float> %b, i8 %mask)
+  ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i8, <4 x float>, i8)
+
+define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) {
+; CHECK-LABEL: test_mask_vextracti64x4:
+; CHECK: vextracti64x4 $2, %zmm1, %ymm0 {%k1}
+  %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i8 2, <4 x i64> %b, i8 %mask)
+  ret <4 x i64> %res
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i8, <4 x i64>, i8)
+
+define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
+; CHECK-LABEL: test_maskz_vextracti32x4:
+; CHECK: vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z}
+  %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i8 2, <4 x i32> zeroinitializer, i8 %mask)
+  ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i8, <4 x i32>, i8)
+
+define <4 x double> @test_vextractf64x4(<8 x double> %a) {
+; CHECK-LABEL: test_vextractf64x4:
+; CHECK: vextractf64x4 $2, %zmm0, %ymm0 ##
+  %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i8 2, <4 x double> zeroinitializer, i8 -1)
+  ret <4 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i8, <4 x double>, i8)
+
+define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
+  ; CHECK-LABEL: test_x86_avx512_pslli_d
+  ; CHECK: vpslld
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_pslli_d 
+  ; CHECK: vpslld $7, %zmm0, %zmm1 {%k1}  
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_pslli_d
+  ; CHECK: vpslld $7, %zmm0, %zmm0 {%k1} {z}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
+  ; CHECK-LABEL: test_x86_avx512_pslli_q
+  ; CHECK: vpsllq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_pslli_q
+  ; CHECK: vpsllq $7, %zmm0, %zmm1 {%k1}   
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_pslli_q
+  ; CHECK: vpsllq $7, %zmm0, %zmm0 {%k1} {z} 
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
+  ; CHECK-LABEL: test_x86_avx512_psrli_d
+  ; CHECK: vpsrld
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psrli_d
+  ; CHECK: vpsrld $7, %zmm0, %zmm1 {%k1}  
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrli_d
+  ; CHECK: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
+  ; CHECK-LABEL: test_x86_avx512_psrli_q
+  ; CHECK: vpsrlq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psrli_q
+  ; CHECK: vpsrlq $7, %zmm0, %zmm1 {%k1}  
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrli_q
+  ; CHECK: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
+  ; CHECK-LABEL: test_x86_avx512_psrai_d
+  ; CHECK: vpsrad
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psrai_d
+  ; CHECK: vpsrad $7, %zmm0, %zmm1 {%k1}  
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrai_d
+  ; CHECK: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
+  ; CHECK-LABEL: test_x86_avx512_psrai_q
+  ; CHECK: vpsraq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psrai_q
+  ; CHECK: vpsraq $7, %zmm0, %zmm1 {%k1}   
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrai_q
+  ; CHECK: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index dd33ffd..35d3348 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -1,12 +1,14 @@
-; RUN: llc < %s -march=x86-64 -mcpu=knl | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
 define i16 @mask16(i16 %x) {
   %m0 = bitcast i16 %x to <16 x i1>
   %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
   %ret = bitcast <16 x i1> %m1 to i16
   ret i16 %ret
-; CHECK: mask16
-; CHECK: knotw
+; CHECK-LABEL: mask16
+; CHECK: kmovw
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw
 ; CHECK: ret
 }
 
@@ -15,8 +17,38 @@ define i8 @mask8(i8 %x) {
   %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
   %ret = bitcast <8 x i1> %m1 to i8
   ret i8 %ret
-; CHECK: mask8
-; CHECK: knotw
+; CHECK-LABEL: mask8
+; CHECK: kmovw
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw
+; CHECK: ret
+}
+
+define void @mask16_mem(i16* %ptr) {
+  %x = load i16* %ptr, align 4
+  %m0 = bitcast i16 %x to <16 x i1>
+  %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <16 x i1> %m1 to i16
+  store i16 %ret, i16* %ptr, align 4
+  ret void
+; CHECK-LABEL: mask16_mem
+; CHECK: kmovw ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
+; CHECK: ret
+}
+
+define void @mask8_mem(i8* %ptr) {
+  %x = load i8* %ptr, align 4
+  %m0 = bitcast i8 %x to <8 x i1>
+  %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <8 x i1> %m1 to i8
+  store i8 %ret, i8* %ptr, align 4
+  ret void
+; CHECK-LABEL: mask8_mem
+; CHECK: kmovw ([[ARG1]]), %k{{[0-7]}}
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
 ; CHECK: ret
 }
 
diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll
index 009802f..93875e8 100644
--- a/test/CodeGen/X86/avx512-mov.ll
+++ b/test/CodeGen/X86/avx512-mov.ll
@@ -153,31 +153,295 @@ define void @test18(i8 * %addr, <8 x i64> %data) {
   ret void
 }
 
-; CHECK-LABEL: store_i1_1
-; CHECK: movb
-; CHECK: movb
+; CHECK-LABEL: test19
+; CHECK: vmovdqu32
+; CHECK: ret
+define void @test19(i8 * %addr, <16 x i32> %data) {
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  store <16 x i32>%data, <16 x i32>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test20
+; CHECK: vmovdqa32
+; CHECK: ret
+define void @test20(i8 * %addr, <16 x i32> %data) {
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  store <16 x i32>%data, <16 x i32>* %vaddr, align 64
+  ret void
+}
+
+; CHECK-LABEL: test21
+; CHECK: vmovdqa64
 ; CHECK: ret
-define void @store_i1_1() {
-  store i1 true, i1 addrspace(3)* undef, align 128
-  store i1 false, i1 addrspace(2)* undef, align 128
+define  <8 x i64> @test21(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %res = load <8 x i64>* %vaddr, align 64
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test22
+; CHECK: vmovdqu64
+; CHECK: ret
+define void @test22(i8 * %addr, <8 x i64> %data) {
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  store <8 x i64>%data, <8 x i64>* %vaddr, align 1
   ret void
 }
 
-; CHECK-LABEL: store_i1_2
-; CHECK: movb
+; CHECK-LABEL: test23
+; CHECK: vmovdqu64
 ; CHECK: ret
-define void @store_i1_2(i64 %a, i64 %b) {
-  %res = icmp eq i64 %a, %b
-  store i1 %res, i1 addrspace(3)* undef, align 128
+define <8 x i64> @test23(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %res = load <8 x i64>* %vaddr, align 1
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test24
+; CHECK: vmovapd
+; CHECK: ret
+define void @test24(i8 * %addr, <8 x double> %data) {
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  store <8 x double>%data, <8 x double>* %vaddr, align 64
   ret void
 }
 
-; CHECK-LABEL: store_i1_3
-; CHECK: kmovw
+; CHECK-LABEL: test25
+; CHECK: vmovapd
+; CHECK: ret
+define <8 x double> @test25(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %res = load <8 x double>* %vaddr, align 64
+  ret <8 x double>%res
+}
+
+; CHECK-LABEL: test26
+; CHECK: vmovaps
 ; CHECK: ret
-define void @store_i1_3(i16 %a) {
-  %a_vec = bitcast i16 %a to <16 x i1>
-  %res = extractelement <16 x i1> %a_vec, i32 4
-  store i1 %res, i1 addrspace(3)* undef, align 128
+define void @test26(i8 * %addr, <16 x float> %data) {
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  store <16 x float>%data, <16 x float>* %vaddr, align 64
   ret void
 }
+
+; CHECK-LABEL: test27
+; CHECK: vmovaps
+; CHECK: ret
+define <16 x float> @test27(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %res = load <16 x float>* %vaddr, align 64
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test28
+; CHECK: vmovupd
+; CHECK: ret
+define void @test28(i8 * %addr, <8 x double> %data) {
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  store <8 x double>%data, <8 x double>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test29
+; CHECK: vmovupd
+; CHECK: ret
+define <8 x double> @test29(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %res = load <8 x double>* %vaddr, align 1
+  ret <8 x double>%res
+}
+
+; CHECK-LABEL: test30
+; CHECK: vmovups
+; CHECK: ret
+define void @test30(i8 * %addr, <16 x float> %data) {
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  store <16 x float>%data, <16 x float>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test31
+; CHECK: vmovups
+; CHECK: ret
+define <16 x float> @test31(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %res = load <16 x float>* %vaddr, align 1
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test32
+; CHECK: vmovdqa32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %r = load <16 x i32>* %vaddr, align 64
+  %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old
+  ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test33
+; CHECK: vmovdqu32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %r = load <16 x i32>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old
+  ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test34
+; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %r = load <16 x i32>* %vaddr, align 64
+  %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer
+  ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test35
+; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %r = load <16 x i32>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer
+  ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test36
+; CHECK: vmovdqa64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %r = load <8 x i64>* %vaddr, align 64
+  %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test37
+; CHECK: vmovdqu64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %r = load <8 x i64>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test38
+; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %r = load <8 x i64>* %vaddr, align 64
+  %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test39
+; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %r = load <8 x i64>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test40
+; CHECK: vmovaps{{.*{%k[1-7]} }}
+; CHECK: ret
+define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
+  %mask = fcmp one <16 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %r = load <16 x float>* %vaddr, align 64
+  %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test41
+; CHECK: vmovups{{.*{%k[1-7]} }}
+; CHECK: ret
+define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
+  %mask = fcmp one <16 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %r = load <16 x float>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test42
+; CHECK: vmovaps{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x float> @test42(i8 * %addr, <16 x float> %mask1) {
+  %mask = fcmp one <16 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %r = load <16 x float>* %vaddr, align 64
+  %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test43
+; CHECK: vmovups{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x float> @test43(i8 * %addr, <16 x float> %mask1) {
+  %mask = fcmp one <16 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %r = load <16 x float>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test44
+; CHECK: vmovapd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
+  %mask = fcmp one <8 x double> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %r = load <8 x double>* %vaddr, align 64
+  %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old
+  ret <8 x double>%res
+}
+
+; CHECK-LABEL: test45
+; CHECK: vmovupd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
+  %mask = fcmp one <8 x double> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %r = load <8 x double>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old
+  ret <8 x double>%res
+}
+
+; CHECK-LABEL: test46
+; CHECK: vmovapd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x double> @test46(i8 * %addr, <8 x double> %mask1) {
+  %mask = fcmp one <8 x double> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %r = load <8 x double>* %vaddr, align 64
+  %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer
+  ret <8 x double>%res
+}
+
+; CHECK-LABEL: test47
+; CHECK: vmovupd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x double> @test47(i8 * %addr, <8 x double> %mask1) {
+  %mask = fcmp one <8 x double> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %r = load <8 x double>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer
+  ret <8 x double>%res
+}
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index 83f4698..0dbf286 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -39,3 +39,56 @@ define double @select03(double %a, double %b, double %c, double %eps) {
   %cond = select i1 %cmp, double %c, double %b
   ret double %cond
 }
+
+; CHECK-LABEL: @select04
+; CHECK: vmovaps %zmm3, %zmm1
+; CHECK-NEXT: ret
+; PR20677
+define <16 x double> @select04(<16 x double> %a, <16 x double> %b) {
+  %sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b
+  ret <16 x double> %sel
+}
+
+; CHECK-LABEL: select05
+; CHECK: kmovw   %esi, %k0
+; CHECK-NEXT: kmovw   %edi, %k1
+; CHECK-NEXT: korw    %k1, %k0, %k0
+; CHECK-NEXT: kmovw   %k0, %eax
+define i8 @select05(i8 %a.0, i8 %m) {
+  %mask = bitcast i8 %m to <8 x i1>
+  %a = bitcast i8 %a.0 to <8 x i1>
+  %r = select <8 x i1> %mask, <8 x i1> <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>, <8 x i1> %a
+  %res = bitcast <8 x i1> %r to i8
+  ret i8 %res;
+}
+
+; CHECK-LABEL: select06
+; CHECK: kmovw   %esi, %k0
+; CHECK-NEXT: kmovw   %edi, %k1
+; CHECK-NEXT: kandw    %k1, %k0, %k0
+; CHECK-NEXT: kmovw   %k0, %eax
+define i8 @select06(i8 %a.0, i8 %m) {
+  %mask = bitcast i8 %m to <8 x i1>
+  %a = bitcast i8 %a.0 to <8 x i1>
+  %r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> zeroinitializer
+  %res = bitcast <8 x i1> %r to i8
+  ret i8 %res;
+}
+
+; CHECK-LABEL: select07
+; CHECK-DAG:  kmovw   %edx, %k0
+; CHECK-DAG:  kmovw   %edi, %k1
+; CHECK-DAG:  kmovw   %esi, %k2
+; CHECK: kandw %k0, %k1, %k1
+; CHECK-NEXT: knotw    %k0, %k0
+; CHECK-NEXT: kandw    %k0, %k2, %k0
+; CHECK-NEXT: korw %k0, %k1, %k0
+; CHECK-NEXT: kmovw   %k0, %eax
+define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) {
+  %mask = bitcast i8 %m to <8 x i1>
+  %a = bitcast i8 %a.0 to <8 x i1>
+  %b = bitcast i8 %b.0 to <8 x i1>
+  %r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> %b
+  %res = bitcast <8 x i1> %r to i8
+  ret i8 %res;
+}
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
deleted file mode 100644
index b99e89a..0000000
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ /dev/null
@@ -1,314 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
-; CHECK: LCP
-; CHECK: .long 2
-; CHECK: .long 5
-; CHECK: .long 0
-; CHECK: .long 0
-; CHECK: .long 7
-; CHECK: .long 0
-; CHECK: .long 10
-; CHECK: .long 1
-; CHECK: .long 0
-; CHECK: .long 5
-; CHECK: .long 0
-; CHECK: .long 4
-; CHECK: .long 7
-; CHECK: .long 0
-; CHECK: .long 10
-; CHECK: .long 1
-; CHECK-LABEL: test1:
-; CHECK: vpermps
-; CHECK: ret
-define <16 x float> @test1(<16 x float> %a) nounwind {
-  %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1,  i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
-  ret <16 x float> %c
-}
-
-; CHECK-LABEL: test2:
-; CHECK: vpermd
-; CHECK: ret
-define <16 x i32> @test2(<16 x i32> %a) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1,  i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
-  ret <16 x i32> %c
-}
-
-; CHECK-LABEL: test3:
-; CHECK: vpermq
-; CHECK: ret
-define <8 x i64> @test3(<8 x i64> %a) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 5, i32 1, i32 undef, i32 7, i32 undef, i32 3, i32 1>
-  ret <8 x i64> %c
-}
-
-; CHECK-LABEL: test4:
-; CHECK: vpermpd
-; CHECK: ret
-define <8 x double> @test4(<8 x double> %a) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  ret <8 x double> %c
-}
-
-; CHECK-LABEL: test5:
-; CHECK: vpermt2pd
-; CHECK: ret
-define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
-  ret <8 x double> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test5m:
-; CHECK: vpermt2pd {{.* {%k[1-7]} {z}}}
-define <8 x double> @test5m(<8 x double> %a, <8 x double> %b, i8 %mask) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
-  %m = bitcast i8 %mask to <8 x i1>
-  %res = select <8 x i1> %m, <8 x double> %c, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-; CHECK-LABEL: test6:
-; CHECK: vpermq $30
-; CHECK: ret
-define <8 x i64> @test6(<8 x i64> %a) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
-  ret <8 x i64> %c
-}
-
-; CHECK-LABEL: test7:
-; CHECK: vpermt2q
-; CHECK: ret
-define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
-  ret <8 x i64> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test7m:
-; CHECK: vpermt2q {{.* {%k[1-7]} {z}}}
-define <8 x i64> @test7m(<8 x i64> %a, <8 x i64> %b, i8 %mask) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
-  %m = bitcast i8 %mask to <8 x i1>
-  %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-; The mem variant of vpermt2 with a writemask
-; CHECK-LABEL: test7mm:
-; CHECK: vpermt2q {{\(.*\).* {%k[1-7]} {z}}}
-define <8 x i64> @test7mm(<8 x i64> %a, <8 x i64> *%pb, i8 %mask) nounwind {
-  %b = load <8 x i64>* %pb
-  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
-  %m = bitcast i8 %mask to <8 x i1>
-  %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-; CHECK-LABEL: test8:
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  ret <16 x i32> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test8m:
-; CHECK: vpermt2d {{.* {%k[1-7]} {z}}}
-define <16 x i32> @test8m(<16 x i32> %a, <16 x i32> %b, i16 %mask) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  %m = bitcast i16 %mask to <16 x i1>
-  %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-; The mem variant of vpermt2 with a writemask
-; CHECK-LABEL: test8mm:
-; CHECK: vpermt2d {{\(.*\).* {%k[1-7]} {z}}}
-define <16 x i32> @test8mm(<16 x i32> %a, <16 x i32> *%pb, i16 %mask) nounwind {
-  %b = load <16 x i32> * %pb
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  %m = bitcast i16 %mask to <16 x i1>
-  %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-; CHECK-LABEL: test9:
-; CHECK: vpermt2ps
-; CHECK: ret
-define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
-  %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  ret <16 x float> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test9m:
-; CHECK: vpermt2ps {{.*}} {%k{{.}}} {z}
-define <16 x float> @test9m(<16 x float> %a, <16 x float> %b, i16 %mask) nounwind {
-  %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  %m = bitcast i16 %mask to <16 x i1>
-  %res = select <16 x i1> %m, <16 x float> %c, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-; CHECK-LABEL: test10:
-; CHECK: vpermt2ps (
-; CHECK: ret
-define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
-  %c = load <16 x float>* %b
-  %d = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  ret <16 x float> %d
-}
-
-; CHECK-LABEL: test11:
-; CHECK: vpermt2d 
-; CHECK: ret
-define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
-  %c = load <16 x i32>* %b
-  %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  ret <16 x i32> %d
-}
-
-; CHECK-LABEL: test12
-; CHECK: vmovlhps {{.*}}## encoding: [0x62
-; CHECK: ret
-define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) nounwind {
-  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  ret <4 x i32> %c
-}
-
-; CHECK-LABEL: test13
-; CHECK: vpermilps $-79, %zmm
-; CHECK: ret
-define <16 x float> @test13(<16 x float> %a) {
- %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
- ret <16 x float> %b
-}
-
-; CHECK-LABEL: test14
-; CHECK: vpermilpd $-53, %zmm
-; CHECK: ret
-define <8 x double> @test14(<8 x double> %a) {
- %b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32><i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 7>
- ret <8 x double> %b
-}
-
-; CHECK-LABEL: test15
-; CHECK: vpshufd $-79, %zmm
-; CHECK: ret
-define <16 x i32> @test15(<16 x i32> %a) {
- %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
- ret <16 x i32> %b
-}
-; CHECK-LABEL: test16
-; CHECK: valignq $2, %zmm0, %zmm1
-; CHECK: ret
-define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-  ret <8 x double> %c
-}
-
-; CHECK-LABEL: test17
-; CHECK: vshufpd $19, %zmm1, %zmm0
-; CHECK: ret
-define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 2, i32 10, i32 5, i32 undef, i32 undef, i32 undef>
-  ret <8 x double> %c
-}
-
-; CHECK-LABEL: test18
-; CHECK: vpunpckhdq %zmm
-; CHECK: ret
-define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) {
- %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15, i32 18, i32 26, i32 19, i32 27, i32 22, i32 30, i32 23, i32 31>
- ret <16 x i32> %b
-}
-
-; CHECK-LABEL: test19
-; CHECK: vpunpckldq %zmm
-; CHECK: ret
-define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) {
- %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29>
- ret <16 x i32> %b
-}
-
-; CHECK-LABEL: test20
-; CHECK: vpunpckhqdq  %zmm
-; CHECK: ret
-define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) {
- %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32><i32 1, i32 5, i32 3, i32 7, i32 9, i32 13, i32 11, i32 15>
- ret <8 x i64> %b
-}
-
-; CHECK-LABEL: test21
-; CHECK: vunpcklps %zmm
-; CHECK: ret
-define <16 x float> @test21(<16 x float> %a, <16 x float> %c) {
- %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29>
- ret <16 x float> %b
-}
-
-; CHECK-LABEL: test22
-; CHECK: vmovhlps {{.*}}## encoding: [0x62
-; CHECK: ret
-define <4 x i32> @test22(<4 x i32> %a, <4 x i32> %b) nounwind {
-  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  ret <4 x i32> %c
-}
-
-; CHECK-LABEL: @test23
-; CHECK: vshufps $-112, %zmm
-; CHECK: ret
-define <16 x float> @test23(<16 x float> %a, <16 x float> %c) {
- %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 0, i32 17, i32 18, i32 4, i32 4, i32 21, i32 22, i32 8, i32 8, i32 25, i32 26, i32 12, i32 12, i32 29, i32 30>
- ret <16 x float> %b
-}
-
-; CHECK-LABEL: @test24
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test25
-; CHECK: vshufps  $52
-; CHECK: ret
-define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 19, i32 undef, i32 4, i32 5, i32 23, i32 undef, i32 8, i32 9, i32 27, i32 undef, i32 12, i32 13, i32 undef, i32 undef>
-  ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test26
-; CHECK: vmovshdup
-; CHECK: ret
-define <16 x i32> @test26(<16 x i32> %a) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 undef, i32 9, i32 9, i32 undef, i32 11, i32 13, i32 undef, i32 undef, i32 undef>
-  ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test27
-; CHECK: ret
-define <16 x i32> @test27(<4 x i32>%a) {
- %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <16 x i32> %res
-}
-
-; CHECK-LABEL: @test28
-; CHECK: vinserti64x4 $1
-; CHECK: ret
-define <16 x i32> @test28(<16 x i32>%x, <16 x i32>%y) {
- %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
-                                                              i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
- ret <16 x i32> %res
-}
-
-; CHECK-LABEL: @test29
-; CHECK: vinserti64x4 $0
-; CHECK: ret
-define <16 x i32> @test29(<16 x i32>%x, <16 x i32>%y) {
- %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
-                                                              i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- ret <16 x i32> %res
-}
-
diff --git a/test/CodeGen/X86/avx512-trunc-ext.ll b/test/CodeGen/X86/avx512-trunc-ext.ll
index 5e097be..91ef5d5 100644
--- a/test/CodeGen/X86/avx512-trunc-ext.ll
+++ b/test/CodeGen/X86/avx512-trunc-ext.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX %s
 
 ; CHECK-LABEL: trunc_16x32_to_16x8
 ; CHECK: vpmovdb
@@ -118,6 +119,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
 
 ; CHECK-LABEL: sext_8i1_8i32
 ; CHECK: vpbroadcastq  LCP{{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX: vpmovm2d
 ; CHECK: ret
 define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
   %x = icmp slt <8 x i32> %a1, %a2
@@ -135,9 +137,8 @@ define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) {
 }
 
 ; CHECK-LABEL: trunc_i32_to_i1
-; CHECK: andl
-; CHECK: kmov
-; CHECK: kortest
+; CHECK: testb
+; CHECK: setne
 ; CKECK: orl
 ; CHECK: ret
 define i16 @trunc_i32_to_i1(i32 %a) {
@@ -146,3 +147,30 @@ define i16 @trunc_i32_to_i1(i32 %a) {
   %res = bitcast <16 x i1> %maskv to i16
   ret i16 %res
 }
+
+; CHECK-LABEL: sext_8i1_8i16
+; SKX: vpmovm2w
+; CHECK: ret
+define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+  %x = icmp slt <8 x i32> %a1, %a2
+  %y = sext <8 x i1> %x to <8 x i16>
+  ret <8 x i16> %y
+}
+
+; CHECK-LABEL: sext_16i1_16i32
+; SKX: vpmovm2d
+; CHECK: ret
+define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
+  %x = icmp slt <16 x i32> %a1, %a2
+  %y = sext <16 x i1> %x to <16 x i32>
+  ret <16 x i32> %y
+}
+
+; CHECK-LABEL: sext_8i1_8i64
+; SKX: vpmovm2q
+; CHECK: ret
+define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+  %x = icmp slt <8 x i32> %a1, %a2
+  %y = sext <8 x i1> %x to <8 x i64>
+  ret <8 x i64> %y
+}
diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
index 9c6db11..0b0e0fc 100644
--- a/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -1,59 +1,72 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
-;CHECK-LABEL: _inreg16xi32:
-;CHECK: vpbroadcastd {{.*}}, %zmm
-;CHECK: ret
 define   <16 x i32> @_inreg16xi32(i32 %a) {
+; CHECK-LABEL: _inreg16xi32:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpbroadcastd %edi, %zmm0
+; CHECK-NEXT:    retq
   %b = insertelement <16 x i32> undef, i32 %a, i32 0
   %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
   ret <16 x i32> %c
 }
 
-;CHECK-LABEL: _inreg8xi64:
-;CHECK: vpbroadcastq {{.*}}, %zmm
-;CHECK: ret
 define   <8 x i64> @_inreg8xi64(i64 %a) {
+; CHECK-LABEL: _inreg8xi64:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0
+; CHECK-NEXT:    retq
   %b = insertelement <8 x i64> undef, i64 %a, i32 0
   %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
   ret <8 x i64> %c
 }
 
-;CHECK-LABEL: _inreg16xfloat:
-;CHECK: vbroadcastss {{.*}}, %zmm
-;CHECK: ret
 define   <16 x float> @_inreg16xfloat(float %a) {
+; CHECK-LABEL: _inreg16xfloat:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = insertelement <16 x float> undef, float %a, i32 0
   %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
   ret <16 x float> %c
 }
 
-;CHECK-LABEL: _inreg8xdouble:
-;CHECK: vbroadcastsd {{.*}}, %zmm
-;CHECK: ret
 define   <8 x double> @_inreg8xdouble(double %a) {
+; CHECK-LABEL: _inreg8xdouble:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = insertelement <8 x double> undef, double %a, i32 0
   %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
   ret <8 x double> %c
 }
 
-;CHECK-LABEL: _xmm16xi32
-;CHECK: vpbroadcastd
-;CHECK: ret
 define   <16 x i32> @_xmm16xi32(<16 x i32> %a) {
+; CHECK-LABEL: _xmm16xi32:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer
   ret <16 x i32> %b
 }
 
-;CHECK-LABEL: _xmm16xfloat
-;CHECK: vbroadcastss {{.*}}## encoding: [0x62
-;CHECK: ret
 define   <16 x float> @_xmm16xfloat(<16 x float> %a) {
+; CHECK-LABEL: _xmm16xfloat:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer
   ret <16 x float> %b
 }
 
 define <16 x i32> @test_vbroadcast() {
-  ; CHECK: vpbroadcastd
+; CHECK-LABEL: test_vbroadcast:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    vcmpunordps %zmm0, %zmm0, %k1
+; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK-NEXT:    knotw %k1, %k1
+; CHECK-NEXT:    vmovdqu32 %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
 entry:
   %0 = sext <16 x i1> zeroinitializer to <16 x i32>
   %1 = fcmp uno <16 x float> undef, zeroinitializer
@@ -62,3 +75,108 @@ entry:
   ret <16 x i32> %3
 }
 
+; We implement the set1 intrinsics with vector initializers.  Verify that the
+; IR generated will produce broadcasts at the end.
+define <8 x double> @test_set1_pd(double %d) #2 {
+; CHECK-LABEL: test_set1_pd:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %vecinit.i = insertelement <8 x double> undef, double %d, i32 0
+  %vecinit1.i = insertelement <8 x double> %vecinit.i, double %d, i32 1
+  %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %d, i32 2
+  %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %d, i32 3
+  %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %d, i32 4
+  %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %d, i32 5
+  %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %d, i32 6
+  %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %d, i32 7
+  ret <8 x double> %vecinit7.i
+}
+
+define <8 x i64> @test_set1_epi64(i64 %d) #2 {
+; CHECK-LABEL: test_set1_epi64:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %vecinit.i = insertelement <8 x i64> undef, i64 %d, i32 0
+  %vecinit1.i = insertelement <8 x i64> %vecinit.i, i64 %d, i32 1
+  %vecinit2.i = insertelement <8 x i64> %vecinit1.i, i64 %d, i32 2
+  %vecinit3.i = insertelement <8 x i64> %vecinit2.i, i64 %d, i32 3
+  %vecinit4.i = insertelement <8 x i64> %vecinit3.i, i64 %d, i32 4
+  %vecinit5.i = insertelement <8 x i64> %vecinit4.i, i64 %d, i32 5
+  %vecinit6.i = insertelement <8 x i64> %vecinit5.i, i64 %d, i32 6
+  %vecinit7.i = insertelement <8 x i64> %vecinit6.i, i64 %d, i32 7
+  ret <8 x i64> %vecinit7.i
+}
+
+define <16 x float> @test_set1_ps(float %f) #2 {
+; CHECK-LABEL: test_set1_ps:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %vecinit.i = insertelement <16 x float> undef, float %f, i32 0
+  %vecinit1.i = insertelement <16 x float> %vecinit.i, float %f, i32 1
+  %vecinit2.i = insertelement <16 x float> %vecinit1.i, float %f, i32 2
+  %vecinit3.i = insertelement <16 x float> %vecinit2.i, float %f, i32 3
+  %vecinit4.i = insertelement <16 x float> %vecinit3.i, float %f, i32 4
+  %vecinit5.i = insertelement <16 x float> %vecinit4.i, float %f, i32 5
+  %vecinit6.i = insertelement <16 x float> %vecinit5.i, float %f, i32 6
+  %vecinit7.i = insertelement <16 x float> %vecinit6.i, float %f, i32 7
+  %vecinit8.i = insertelement <16 x float> %vecinit7.i, float %f, i32 8
+  %vecinit9.i = insertelement <16 x float> %vecinit8.i, float %f, i32 9
+  %vecinit10.i = insertelement <16 x float> %vecinit9.i, float %f, i32 10
+  %vecinit11.i = insertelement <16 x float> %vecinit10.i, float %f, i32 11
+  %vecinit12.i = insertelement <16 x float> %vecinit11.i, float %f, i32 12
+  %vecinit13.i = insertelement <16 x float> %vecinit12.i, float %f, i32 13
+  %vecinit14.i = insertelement <16 x float> %vecinit13.i, float %f, i32 14
+  %vecinit15.i = insertelement <16 x float> %vecinit14.i, float %f, i32 15
+  ret <16 x float> %vecinit15.i
+}
+
+define <16 x i32> @test_set1_epi32(i32 %f) #2 {
+; CHECK-LABEL: test_set1_epi32:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vpbroadcastd %edi, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %vecinit.i = insertelement <16 x i32> undef, i32 %f, i32 0
+  %vecinit1.i = insertelement <16 x i32> %vecinit.i, i32 %f, i32 1
+  %vecinit2.i = insertelement <16 x i32> %vecinit1.i, i32 %f, i32 2
+  %vecinit3.i = insertelement <16 x i32> %vecinit2.i, i32 %f, i32 3
+  %vecinit4.i = insertelement <16 x i32> %vecinit3.i, i32 %f, i32 4
+  %vecinit5.i = insertelement <16 x i32> %vecinit4.i, i32 %f, i32 5
+  %vecinit6.i = insertelement <16 x i32> %vecinit5.i, i32 %f, i32 6
+  %vecinit7.i = insertelement <16 x i32> %vecinit6.i, i32 %f, i32 7
+  %vecinit8.i = insertelement <16 x i32> %vecinit7.i, i32 %f, i32 8
+  %vecinit9.i = insertelement <16 x i32> %vecinit8.i, i32 %f, i32 9
+  %vecinit10.i = insertelement <16 x i32> %vecinit9.i, i32 %f, i32 10
+  %vecinit11.i = insertelement <16 x i32> %vecinit10.i, i32 %f, i32 11
+  %vecinit12.i = insertelement <16 x i32> %vecinit11.i, i32 %f, i32 12
+  %vecinit13.i = insertelement <16 x i32> %vecinit12.i, i32 %f, i32 13
+  %vecinit14.i = insertelement <16 x i32> %vecinit13.i, i32 %f, i32 14
+  %vecinit15.i = insertelement <16 x i32> %vecinit14.i, i32 %f, i32 15
+  ret <16 x i32> %vecinit15.i
+}
+
+; We implement the scalar broadcast intrinsics with vector initializers.
+; Verify that the IR generated will produce the broadcast at the end.
+define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a) {
+; CHECK-LABEL: test_mm512_broadcastsd_pd:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %0 = extractelement <2 x double> %a, i32 0
+  %vecinit.i = insertelement <8 x double> undef, double %0, i32 0
+  %vecinit1.i = insertelement <8 x double> %vecinit.i, double %0, i32 1
+  %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %0, i32 2
+  %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %0, i32 3
+  %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %0, i32 4
+  %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %0, i32 5
+  %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %0, i32 6
+  %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %0, i32 7
+  ret <8 x double> %vecinit7.i
+}
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index d762f00..c71e60e 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1,145 +1,176 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
-; CHECK-LABEL: test1
-; CHECK: vcmpleps
-; CHECK: vmovups
-; CHECK: ret
 define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vcmpleps %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %mask = fcmp ole <16 x float> %x, %y
   %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
   ret <16 x float> %max
 }
 
-; CHECK-LABEL: test2
-; CHECK: vcmplepd
-; CHECK: vmovupd
-; CHECK: ret
 define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
+; CHECK-LABEL: test2:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vcmplepd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %mask = fcmp ole <8 x double> %x, %y
   %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
   ret <8 x double> %max
 }
 
-; CHECK-LABEL: test3
-; CHECK: vpcmpeqd  (%rdi)
-; CHECK: vmovdqu32
-; CHECK: ret
 define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwind {
+; CHECK-LABEL: test3:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpeqd (%rdi), %zmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %y = load <16 x i32>* %yp, align 4
   %mask = icmp eq <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
   ret <16 x i32> %max
 }
 
-; CHECK-LABEL: @test4_unsigned
-; CHECK: vpcmpnltud
-; CHECK: vmovdqu32
-; CHECK: ret
 define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y) nounwind {
+; CHECK-LABEL: test4_unsigned:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %mask = icmp uge <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
   ret <16 x i32> %max
 }
 
-; CHECK-LABEL: test5
-; CHECK: vpcmpeqq {{.*}}%k1
-; CHECK: vmovdqu64 {{.*}}%k1
-; CHECK: ret
 define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
+; CHECK-LABEL: test5:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %mask = icmp eq <8 x i64> %x, %y
   %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
   ret <8 x i64> %max
 }
 
-; CHECK-LABEL: test6_unsigned
-; CHECK: vpcmpnleuq {{.*}}%k1
-; CHECK: vmovdqu64 {{.*}}%k1
-; CHECK: ret
 define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y) nounwind {
+; CHECK-LABEL: test6_unsigned:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %mask = icmp ugt <8 x i64> %x, %y
   %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
   ret <8 x i64> %max
 }
 
-; CHECK-LABEL: test7
-; CHECK: xor
-; CHECK: vcmpltps
-; CHECK: vblendvps
-; CHECK: ret
 define <4 x float> @test7(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test7:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpltps %xmm2, %xmm0, %xmm2
+; CHECK-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %mask = fcmp olt <4 x float> %a, zeroinitializer
   %c = select <4 x i1>%mask, <4 x float>%a, <4 x float>%b
   ret <4 x float>%c
 }
 
-; CHECK-LABEL: test8
-; CHECK: xor
-; CHECK: vcmpltpd
-; CHECK: vblendvpd
-; CHECK: ret
 define <2 x double> @test8(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpltpd %xmm2, %xmm0, %xmm2
+; CHECK-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %mask = fcmp olt <2 x double> %a, zeroinitializer
   %c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b
   ret <2 x double>%c
 }
 
-; CHECK-LABEL: test9
-; CHECK: vpcmpeqd
-; CHECK: vpblendmd
-; CHECK: ret
 define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
+; CHECK-LABEL: test9:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:      ## kill: YMM1<def> YMM1<kill> ZMM1<def>
+; CHECK-NEXT:      ## kill: YMM0<def> YMM0<kill> ZMM0<def>
+; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:      ## kill: YMM0<def> YMM0<kill> ZMM0<kill>
+; CHECK-NEXT:    retq
   %mask = icmp eq <8 x i32> %x, %y
   %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
   ret <8 x i32> %max
 }
 
-; CHECK-LABEL: test10
-; CHECK: vcmpeqps
-; CHECK: vblendmps
-; CHECK: ret
 define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
+; CHECK-LABEL: test10:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:      ## kill: YMM1<def> YMM1<kill> ZMM1<def>
+; CHECK-NEXT:      ## kill: YMM0<def> YMM0<kill> ZMM0<def>
+; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:      ## kill: YMM0<def> YMM0<kill> ZMM0<kill>
+; CHECK-NEXT:    retq
   %mask = fcmp oeq <8 x float> %x, %y
   %max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
   ret <8 x float> %max
 }
 
-; CHECK-LABEL: test11_unsigned
-; CHECK: vpmaxud
-; CHECK: ret
 define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
+; CHECK-LABEL: test11_unsigned:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %mask = icmp ugt <8 x i32> %x, %y
   %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
   ret <8 x i32> %max
 }
 
-; CHECK-LABEL: test12
-; CHECK: vpcmpeqq        %zmm2, %zmm0, [[LO:%k[0-7]]]
-; CHECK: vpcmpeqq        %zmm3, %zmm1, [[HI:%k[0-7]]]
-; CHECK: kunpckbw        [[LO]], [[HI]], {{%k[0-7]}}
 
 define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
+; CHECK-LABEL: test12:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    kunpckbw %k0, %k1, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:      ## kill: AX<def> AX<kill> EAX<kill>
+; CHECK-NEXT:    retq
   %res = icmp eq <16 x i64> %a, %b
   %res1 = bitcast <16 x i1> %res to i16
   ret i16 %res1
 }
 
-; CHECK-LABEL: test13
-; CHECK: vcmpeqps        %zmm
-; CHECK: vpbroadcastd
-; CHECK: ret
 define <16 x i32> @test13(<16 x float>%a, <16 x float>%b)
+; CHECK-LABEL: test13:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
 {
   %cmpvector_i = fcmp oeq <16 x float> %a, %b
   %conv = zext <16 x i1> %cmpvector_i to <16 x i32>
   ret <16 x i32> %conv
 }
 
-; CHECK-LABEL: test14
-; CHECK: vpcmp
-; CHECK-NOT: vpcmp
-; CHECK: vmovdqu32 {{.*}}{%k1} {z}
-; CHECK: ret
 define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
+; CHECK-LABEL: test14:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
+; CHECK-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; CHECK-NEXT:    knotw %k0, %k0
+; CHECK-NEXT:    knotw %k0, %k1
+; CHECK-NEXT:    vmovdqu32 %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
   %sub_r = sub <16 x i32> %a, %b
   %cmp.i2.i = icmp sgt <16 x i32> %sub_r, %a
   %sext.i3.i = sext <16 x i1> %cmp.i2.i to <16 x i32>
@@ -148,12 +179,15 @@ define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
   ret <16 x i32>%res
 }
 
-; CHECK-LABEL: test15
-; CHECK: vpcmpgtq
-; CHECK-NOT: vpcmp
-; CHECK: vmovdqu64 {{.*}}{%k1} {z}
-; CHECK: ret
 define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
+; CHECK-LABEL: test15:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm1
+; CHECK-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; CHECK-NEXT:    knotw %k0, %k0
+; CHECK-NEXT:    knotw %k0, %k1
+; CHECK-NEXT:    vmovdqu64 %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
   %sub_r = sub <8 x i64> %a, %b
   %cmp.i2.i = icmp sgt <8 x i64> %sub_r, %a
   %sext.i3.i = sext <8 x i1> %cmp.i2.i to <8 x i64>
@@ -162,3 +196,181 @@ define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
   ret <8 x i64>%res
 }
 
+define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y) nounwind {
+; CHECK-LABEL: test16:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpled %zmm0, %zmm1, %k1
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %mask = icmp sge <16 x i32> %x, %y
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+  ret <16 x i32> %max
+}
+
+define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test17:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpgtd (%rdi), %zmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %y = load <16 x i32>* %y.ptr, align 4
+  %mask = icmp sgt <16 x i32> %x, %y
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+  ret <16 x i32> %max
+}
+
+define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test18:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpled (%rdi), %zmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %y = load <16 x i32>* %y.ptr, align 4
+  %mask = icmp sle <16 x i32> %x, %y
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+  ret <16 x i32> %max
+}
+
+define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test19:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpleud (%rdi), %zmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %y = load <16 x i32>* %y.ptr, align 4
+  %mask = icmp ule <16 x i32> %x, %y
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+  ret <16 x i32> %max
+}
+
+define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> %y1) nounwind {
+; CHECK-LABEL: test20:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %mask1 = icmp eq <16 x i32> %x1, %y1
+  %mask0 = icmp eq <16 x i32> %x, %y
+  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+  ret <16 x i32> %max
+}
+
+define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1) nounwind {
+; CHECK-LABEL: test21:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vpcmpleq %zmm2, %zmm3, %k1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
+  %mask1 = icmp sge <8 x i64> %x1, %y1
+  %mask0 = icmp sle <8 x i64> %x, %y
+  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
+  ret <8 x i64> %max
+}
+
+define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {
+; CHECK-LABEL: test22:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpgtq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %mask1 = icmp sgt <8 x i64> %x1, %y1
+  %y = load <8 x i64>* %y.ptr, align 4
+  %mask0 = icmp sgt <8 x i64> %x, %y
+  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
+  ret <8 x i64> %max
+}
+
+define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {
+; CHECK-LABEL: test23:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpled %zmm1, %zmm2, %k1
+; CHECK-NEXT:    vpcmpleud (%rdi), %zmm0, %k1 {%k1}
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %mask1 = icmp sge <16 x i32> %x1, %y1
+  %y = load <16 x i32>* %y.ptr, align 4
+  %mask0 = icmp ule <16 x i32> %x, %y
+  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+  ret <16 x i32> %max
+}
+
+define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
+; CHECK-LABEL: test24:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %yb = load i64* %yb.ptr, align 4
+  %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
+  %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
+  %mask = icmp eq <8 x i64> %x, %y
+  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
+  ret <8 x i64> %max
+}
+
+define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind {
+; CHECK-LABEL: test25:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpled (%rdi){1to16}, %zmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %yb = load i32* %yb.ptr, align 4
+  %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
+  %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
+  %mask = icmp sle <16 x i32> %x, %y
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+  ret <16 x i32> %max
+}
+
+define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {
+; CHECK-LABEL: test26:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpled %zmm1, %zmm2, %k1
+; CHECK-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %mask1 = icmp sge <16 x i32> %x1, %y1
+  %yb = load i32* %yb.ptr, align 4
+  %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
+  %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
+  %mask0 = icmp sgt <16 x i32> %x, %y
+  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+  ret <16 x i32> %max
+}
+
+define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {
+; CHECK-LABEL: test27:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpleq        %zmm1, %zmm2, %k1
+; CHECK-NEXT:    vpcmpleq        (%rdi){1to8}, %zmm0, %k1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %mask1 = icmp sge <8 x i64> %x1, %y1
+  %yb = load i64* %yb.ptr, align 4
+  %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
+  %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
+  %mask0 = icmp sle <8 x i64> %x, %y
+  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
+  ret <8 x i64> %max
+}
diff --git a/test/CodeGen/X86/avx512-zext-load-crash.ll b/test/CodeGen/X86/avx512-zext-load-crash.ll
deleted file mode 100644
index 07ded13..0000000
--- a/test/CodeGen/X86/avx512-zext-load-crash.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-
-define <8 x i16> @test_zext_load() {
-  ; CHECK: vmovq
-entry:
-  %0 = load <2 x i16> ** undef, align 8
-  %1 = getelementptr inbounds <2 x i16>* %0, i64 1
-  %2 = load <2 x i16>* %0, align 1
-  %3 = shufflevector <2 x i16> %2, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %4 = load <2 x i16>* %1, align 1
-  %5 = shufflevector <2 x i16> %4, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %6 = shufflevector <8 x i16> %3, <8 x i16> %5, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
-  ret <8 x i16> %6
-}
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
new file mode 100644
index 0000000..bbc418c
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -0,0 +1,305 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw --show-mc-encoding| FileCheck %s
+
+define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
+; CHECK-LABEL: test_pcmpeq_b
+; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ##
+  %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
+  ret i64 %res
+}
+
+define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_b
+; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ##
+  %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
+  ret i64 %res
+}
+
+declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64)
+
+define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) {
+; CHECK-LABEL: test_pcmpeq_w
+; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ##
+  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
+  ret i32 %res
+}
+
+define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_w
+; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ##
+  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
+  ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32)
+
+define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
+; CHECK-LABEL: test_pcmpgt_b
+; CHECK: vpcmpgtb %zmm1, %zmm0, %k0 ##
+  %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
+  ret i64 %res
+}
+
+define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_b
+; CHECK: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} ##
+  %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
+  ret i64 %res
+}
+
+declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64)
+
+define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) {
+; CHECK-LABEL: test_pcmpgt_w
+; CHECK: vpcmpgtw %zmm1, %zmm0, %k0 ##
+  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
+  ret i32 %res
+}
+
+define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_w
+; CHECK: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} ##
+  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
+  ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32)
+
+define <8 x i64> @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
+; CHECK_LABEL: test_cmp_b_512
+; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ##
+  %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
+  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
+; CHECK: vpcmpltb %zmm1, %zmm0, %k0 ##
+  %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
+  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
+; CHECK: vpcmpleb %zmm1, %zmm0, %k0 ##
+  %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1)
+  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
+; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 ##
+  %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1)
+  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
+; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 ##
+  %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1)
+  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
+; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 ##
+  %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1)
+  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
+; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 ##
+  %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1)
+  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
+; CHECK: vpcmpordb %zmm1, %zmm0, %k0 ##
+  %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1)
+  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
+  ret <8 x i64> %vec7
+}
+
+define <8 x i64> @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+; CHECK_LABEL: test_mask_cmp_b_512
+; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ##
+  %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
+  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
+; CHECK: vpcmpltb %zmm1, %zmm0, %k0 {%k1} ##
+  %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
+  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
+; CHECK: vpcmpleb %zmm1, %zmm0, %k0 {%k1} ##
+  %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask)
+  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
+; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 {%k1} ##
+  %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask)
+  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
+; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} ##
+  %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask)
+  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
+; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} ##
+  %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask)
+  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
+; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 {%k1} ##
+  %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask)
+  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
+; CHECK: vpcmpordb %zmm1, %zmm0, %k0 {%k1} ##
+  %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask)
+  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
+  ret <8 x i64> %vec7
+}
+
+declare i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
+
+define <8 x i64> @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
+; CHECK_LABEL: test_ucmp_b_512
+; CHECK: vpcmpequb %zmm1, %zmm0, %k0 ##
+  %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
+  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
+; CHECK: vpcmpltub %zmm1, %zmm0, %k0 ##
+  %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
+  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
+; CHECK: vpcmpleub %zmm1, %zmm0, %k0 ##
+  %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1)
+  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
+; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 ##
+  %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1)
+  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
+; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 ##
+  %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1)
+  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
+; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 ##
+  %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1)
+  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
+; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 ##
+  %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1)
+  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
+; CHECK: vpcmpordub %zmm1, %zmm0, %k0 ##
+  %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1)
+  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
+  ret <8 x i64> %vec7
+}
+
+define <8 x i64> @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+; CHECK_LABEL: test_mask_ucmp_b_512
+; CHECK: vpcmpequb %zmm1, %zmm0, %k0 {%k1} ##
+  %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
+  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
+; CHECK: vpcmpltub %zmm1, %zmm0, %k0 {%k1} ##
+  %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
+  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
+; CHECK: vpcmpleub %zmm1, %zmm0, %k0 {%k1} ##
+  %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask)
+  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
+; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 {%k1} ##
+  %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask)
+  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
+; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 {%k1} ##
+  %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask)
+  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
+; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} ##
+  %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask)
+  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
+; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} ##
+  %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask)
+  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
+; CHECK: vpcmpordub %zmm1, %zmm0, %k0 {%k1} ##
+  %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask)
+  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
+  ret <8 x i64> %vec7
+}
+
+declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
+
+define <8 x i32> @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
+; CHECK_LABEL: test_cmp_w_512
+; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ##
+  %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
+  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltw %zmm1, %zmm0, %k0 ##
+  %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
+  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmplew %zmm1, %zmm0, %k0 ##
+  %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1)
+  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 ##
+  %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1)
+  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 ##
+  %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1)
+  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 ##
+  %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1)
+  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 ##
+  %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1)
+  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordw %zmm1, %zmm0, %k0 ##
+  %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1)
+  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+  ret <8 x i32> %vec7
+}
+
+define <8 x i32> @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
+; CHECK_LABEL: test_mask_cmp_w_512
+; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ##
+  %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
+  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltw %zmm1, %zmm0, %k0 {%k1} ##
+  %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
+  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmplew %zmm1, %zmm0, %k0 {%k1} ##
+  %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask)
+  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 {%k1} ##
+  %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask)
+  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} ##
+  %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask)
+  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} ##
+  %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask)
+  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 {%k1} ##
+  %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask)
+  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordw %zmm1, %zmm0, %k0 {%k1} ##
+  %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask)
+  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+  ret <8 x i32> %vec7
+}
+
+declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
+
+define <8 x i32> @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
+; CHECK_LABEL: test_ucmp_w_512
+; CHECK: vpcmpequw %zmm1, %zmm0, %k0 ##
+  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
+  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 ##
+  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
+  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 ##
+  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1)
+  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 ##
+  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1)
+  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 ##
+  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1)
+  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 ##
+  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1)
+  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 ##
+  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1)
+  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmporduw %zmm1, %zmm0, %k0 ##
+  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1)
+  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+  ret <8 x i32> %vec7
+}
+
+define <8 x i32> @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
+; CHECK_LABEL: test_mask_ucmp_w_512
+; CHECK: vpcmpequw %zmm1, %zmm0, %k0 {%k1} ##
+  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
+  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} ##
+  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
+  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} ##
+  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask)
+  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 {%k1} ##
+  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask)
+  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 {%k1} ##
+  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask)
+  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} ##
+  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask)
+  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} ##
+  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask)
+  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmporduw %zmm1, %zmm0, %k0 {%k1} ##
+  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask)
+  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+  ret <8 x i32> %vec7
+}
+
+declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
diff --git a/test/CodeGen/X86/avx512bw-mask-op.ll b/test/CodeGen/X86/avx512bw-mask-op.ll
new file mode 100644
index 0000000..9d7630c
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-mask-op.ll
@@ -0,0 +1,99 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
+define i32 @mask32(i32 %x) {
+  %m0 = bitcast i32 %x to <32 x i1>
+  %m1 = xor <32 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <32 x i1> %m1 to i32
+  ret i32 %ret
+; CHECK-LABEL: mask32
+; CHECK: kmovd
+; CHECK-NEXT: knotd
+; CHECK-NEXT: kmovd
+; CHECK_NEXT: ret
+}
+
+define i64 @mask64(i64 %x) {
+  %m0 = bitcast i64 %x to <64 x i1>
+  %m1 = xor <64 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <64 x i1> %m1 to i64
+  ret i64 %ret
+; CHECK-LABEL: mask64
+; CHECK: kmovq
+; CHECK-NEXT: knotq
+; CHECK-NEXT: kmovq
+; CHECK_NEXT: ret
+}
+
+define void @mask32_mem(i32* %ptr) {
+  %x = load i32* %ptr, align 4
+  %m0 = bitcast i32 %x to <32 x i1>
+  %m1 = xor <32 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <32 x i1> %m1 to i32
+  store i32 %ret, i32* %ptr, align 4
+  ret void
+; CHECK-LABEL: mask32_mem
+; CHECK: kmovd ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
+; CHECK-NEXT: knotd
+; CHECK-NEXT: kmovd %k{{[0-7]}}, ([[ARG1]])
+; CHECK_NEXT: ret
+}
+
+define void @mask64_mem(i64* %ptr) {
+  %x = load i64* %ptr, align 4
+  %m0 = bitcast i64 %x to <64 x i1>
+  %m1 = xor <64 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <64 x i1> %m1 to i64
+  store i64 %ret, i64* %ptr, align 4
+  ret void
+; CHECK-LABEL: mask64_mem
+; CHECK: kmovq ([[ARG1]]), %k{{[0-7]}}
+; CHECK-NEXT: knotq
+; CHECK-NEXT: kmovq %k{{[0-7]}}, ([[ARG1]])
+; CHECK_NEXT: ret
+}
+
+define i32 @mand32(i32 %x, i32 %y) {
+  %ma = bitcast i32 %x to <32 x i1>
+  %mb = bitcast i32 %y to <32 x i1>
+  %mc = and <32 x i1> %ma, %mb
+  %md = xor <32 x i1> %ma, %mb
+  %me = or <32 x i1> %mc, %md
+  %ret = bitcast <32 x i1> %me to i32
+; CHECK: kandd
+; CHECK: kxord
+; CHECK: kord
+  ret i32 %ret
+}
+
+define i64 @mand64(i64 %x, i64 %y) {
+  %ma = bitcast i64 %x to <64 x i1>
+  %mb = bitcast i64 %y to <64 x i1>
+  %mc = and <64 x i1> %ma, %mb
+  %md = xor <64 x i1> %ma, %mb
+  %me = or <64 x i1> %mc, %md
+  %ret = bitcast <64 x i1> %me to i64
+; CHECK: kandq
+; CHECK: kxorq
+; CHECK: korq
+  ret i64 %ret
+}
diff --git a/test/CodeGen/X86/avx512bw-mov.ll b/test/CodeGen/X86/avx512bw-mov.ll
new file mode 100644
index 0000000..2ff6d28
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-mov.ll
@@ -0,0 +1,81 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s
+
+; CHECK-LABEL: test1
+; CHECK: vmovdqu8
+; CHECK: ret
+define <64 x i8> @test1(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <64 x i8>*
+  %res = load <64 x i8>* %vaddr, align 1
+  ret <64 x i8>%res
+}
+
+; CHECK-LABEL: test2
+; CHECK: vmovdqu8
+; CHECK: ret
+define void @test2(i8 * %addr, <64 x i8> %data) {
+  %vaddr = bitcast i8* %addr to <64 x i8>*
+  store <64 x i8>%data, <64 x i8>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test3
+; CHECK: vmovdqu8{{.*{%k[1-7]}}}
+; CHECK: ret
+define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) {
+  %mask = icmp ne <64 x i8> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <64 x i8>*
+  %r = load <64 x i8>* %vaddr, align 1
+  %res = select <64 x i1> %mask, <64 x i8> %r, <64 x i8> %old
+  ret <64 x i8>%res
+}
+
+; CHECK-LABEL: test4
+; CHECK: vmovdqu8{{.*{%k[1-7]} {z}}}
+; CHECK: ret
+define <64 x i8> @test4(i8 * %addr, <64 x i8> %mask1) {
+  %mask = icmp ne <64 x i8> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <64 x i8>*
+  %r = load <64 x i8>* %vaddr, align 1
+  %res = select <64 x i1> %mask, <64 x i8> %r, <64 x i8> zeroinitializer
+  ret <64 x i8>%res
+}
+
+; CHECK-LABEL: test5
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test5(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <32 x i16>*
+  %res = load <32 x i16>* %vaddr, align 1
+  ret <32 x i16>%res
+}
+
+; CHECK-LABEL: test6
+; CHECK: vmovdqu16
+; CHECK: ret
+define void @test6(i8 * %addr, <32 x i16> %data) {
+  %vaddr = bitcast i8* %addr to <32 x i16>*
+  store <32 x i16>%data, <32 x i16>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test7
+; CHECK: vmovdqu16{{.*{%k[1-7]}}}
+; CHECK: ret
+define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) {
+  %mask = icmp ne <32 x i16> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <32 x i16>*
+  %r = load <32 x i16>* %vaddr, align 1
+  %res = select <32 x i1> %mask, <32 x i16> %r, <32 x i16> %old
+  ret <32 x i16>%res
+}
+
+; CHECK-LABEL: test8
+; CHECK: vmovdqu16{{.*{%k[1-7]} {z}}}
+; CHECK: ret
+define <32 x i16> @test8(i8 * %addr, <32 x i16> %mask1) {
+  %mask = icmp ne <32 x i16> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <32 x i16>*
+  %r = load <32 x i16>* %vaddr, align 1
+  %res = select <32 x i1> %mask, <32 x i16> %r, <32 x i16> zeroinitializer
+  ret <32 x i16>%res
+}
diff --git a/test/CodeGen/X86/avx512bw-vec-cmp.ll b/test/CodeGen/X86/avx512bw-vec-cmp.ll
new file mode 100644
index 0000000..d2b1724
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-vec-cmp.ll
@@ -0,0 +1,135 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
+; CHECK-LABEL: test1
+; CHECK: vpcmpeqb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <64 x i8> @test1(<64 x i8> %x, <64 x i8> %y) nounwind {
+  %mask = icmp eq <64 x i8> %x, %y
+  %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y
+  ret <64 x i8> %max
+}
+
+; CHECK-LABEL: test2
+; CHECK: vpcmpgtb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <64 x i8> @test2(<64 x i8> %x, <64 x i8> %y) nounwind {
+  %mask = icmp sgt <64 x i8> %x, %y
+  %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y
+  ret <64 x i8> %max
+}
+
+; CHECK-LABEL: @test3
+; CHECK: vpcmplew {{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test3(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1) nounwind {
+  %mask = icmp sge <32 x i16> %x, %y
+  %max = select <32 x i1> %mask, <32 x i16> %x1, <32 x i16> %y
+  ret <32 x i16> %max
+}
+
+; CHECK-LABEL: test4
+; CHECK: vpcmpnleub {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <64 x i8> @test4(<64 x i8> %x, <64 x i8> %y) nounwind {
+  %mask = icmp ugt <64 x i8> %x, %y
+  %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y
+  ret <64 x i8> %max
+}
+
+; CHECK-LABEL: test5
+; CHECK: vpcmpeqw  (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test5(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %yp) nounwind {
+  %y = load <32 x i16>* %yp, align 4
+  %mask = icmp eq <32 x i16> %x, %y
+  %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
+  ret <32 x i16> %max
+}
+
+; CHECK-LABEL: @test6
+; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test6(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
+  %y = load <32 x i16>* %y.ptr, align 4
+  %mask = icmp sgt <32 x i16> %x, %y
+  %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
+  ret <32 x i16> %max
+}
+
+; CHECK-LABEL: @test7
+; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test7(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
+  %y = load <32 x i16>* %y.ptr, align 4
+  %mask = icmp sle <32 x i16> %x, %y
+  %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
+  ret <32 x i16> %max
+}
+
+; CHECK-LABEL: @test8
+; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test8(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
+  %y = load <32 x i16>* %y.ptr, align 4
+  %mask = icmp ule <32 x i16> %x, %y
+  %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
+  ret <32 x i16> %max
+}
+
+; CHECK-LABEL: @test9
+; CHECK: vpcmpeqw %zmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test9(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1, <32 x i16> %y1) nounwind {
+  %mask1 = icmp eq <32 x i16> %x1, %y1
+  %mask0 = icmp eq <32 x i16> %x, %y
+  %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer
+  %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %y
+  ret <32 x i16> %max
+}
+
+; CHECK-LABEL: @test10
+; CHECK: vpcmpleb %zmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <64 x i8> @test10(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1, <64 x i8> %y1) nounwind {
+  %mask1 = icmp sge <64 x i8> %x1, %y1
+  %mask0 = icmp sle <64 x i8> %x, %y
+  %mask = select <64 x i1> %mask0, <64 x i1> %mask1, <64 x i1> zeroinitializer
+  %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %x1
+  ret <64 x i8> %max
+}
+
+; CHECK-LABEL: @test11
+; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <64 x i8> @test11(<64 x i8> %x, <64 x i8>* %y.ptr, <64 x i8> %x1, <64 x i8> %y1) nounwind {
+  %mask1 = icmp sgt <64 x i8> %x1, %y1
+  %y = load <64 x i8>* %y.ptr, align 4
+  %mask0 = icmp sgt <64 x i8> %x, %y
+  %mask = select <64 x i1> %mask0, <64 x i1> %mask1, <64 x i1> zeroinitializer
+  %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %x1
+  ret <64 x i8> %max
+}
+
+; CHECK-LABEL: @test12
+; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test12(<32 x i16> %x, <32 x i16>* %y.ptr, <32 x i16> %x1, <32 x i16> %y1) nounwind {
+  %mask1 = icmp sge <32 x i16> %x1, %y1
+  %y = load <32 x i16>* %y.ptr, align 4
+  %mask0 = icmp ule <32 x i16> %x, %y
+  %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer
+  %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
+  ret <32 x i16> %max
+}
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
new file mode 100644
index 0000000..45f8d6d
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -0,0 +1,613 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+
+; 256-bit
+
+define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_pcmpeq_b_256
+; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
+  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
+  ret i32 %res
+}
+
+define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_b_256
+; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
+  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
+  ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32)
+
+define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_pcmpeq_w_256
+; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
+  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
+  ret i16 %res
+}
+
+define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_w_256
+; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
+  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
+  ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16)
+
+define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_pcmpgt_b_256
+; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 ##
+  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
+  ret i32 %res
+}
+
+define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_b_256
+; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ##
+  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
+  ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8>, <32 x i8>, i32)
+
+define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_pcmpgt_w_256
+; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 ##
+  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
+  ret i16 %res
+}
+
+define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_w_256
+; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ##
+  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
+  ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)
+
+define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK_LABEL: test_cmp_b_256
+; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
+  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
+  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltb %ymm1, %ymm0, %k0 ##
+  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
+  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleb %ymm1, %ymm0, %k0 ##
+  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
+  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 ##
+  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
+  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 ##
+  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
+  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 ##
+  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
+  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 ##
+  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
+  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordb %ymm1, %ymm0, %k0 ##
+  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
+  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+  ret <8 x i32> %vec7
+}
+
+define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
+; CHECK_LABEL: test_mask_cmp_b_256
+; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
+  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
+  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ##
+  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
+  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ##
+  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
+  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ##
+  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
+  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ##
+  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
+  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ##
+  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
+  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ##
+  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
+  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ##
+  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
+  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+  ret <8 x i32> %vec7
+}
+
+declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
+
+define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK_LABEL: test_ucmp_b_256
+; CHECK: vpcmpequb %ymm1, %ymm0, %k0 ##
+  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
+  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltub %ymm1, %ymm0, %k0 ##
+  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
+  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleub %ymm1, %ymm0, %k0 ##
+  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
+  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 ##
+  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
+  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 ##
+  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
+  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 ##
+  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
+  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 ##
+  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
+  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordub %ymm1, %ymm0, %k0 ##
+  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
+  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+  ret <8 x i32> %vec7
+}
+
+define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
+; CHECK_LABEL: test_mask_ucmp_b_256
+; CHECK: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ##
+  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
+  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ##
+  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
+  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ##
+  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
+  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ##
+  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
+  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ##
+  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
+  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ##
+  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
+  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ##
+  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
+  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ##
+  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
+  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+  ret <8 x i32> %vec7
+}
+
+declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
+
+define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK_LABEL: test_cmp_w_256
+; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltw %ymm1, %ymm0, %k0 ##
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmplew %ymm1, %ymm0, %k0 ##
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 ##
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 ##
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 ##
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 ##
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordw %ymm1, %ymm0, %k0 ##
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
+; CHECK_LABEL: test_mask_cmp_w_256
+; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltw %ymm1, %ymm0, %k0 {%k1} ##
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmplew %ymm1, %ymm0, %k0 {%k1} ##
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 {%k1} ##
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} ##
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} ##
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 {%k1} ##
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordw %ymm1, %ymm0, %k0 {%k1} ##
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
+
+define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK_LABEL: test_ucmp_w_256
+; CHECK: vpcmpequw %ymm1, %ymm0, %k0 ##
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 ##
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 ##
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 ##
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 ##
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 ##
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 ##
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmporduw %ymm1, %ymm0, %k0 ##
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
+; CHECK_LABEL: test_mask_ucmp_w_256
+; CHECK: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ##
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} ##
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} ##
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 {%k1} ##
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 {%k1} ##
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} ##
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} ##
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmporduw %ymm1, %ymm0, %k0 {%k1} ##
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
+
+; 128-bit
+
+define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_pcmpeq_b_128
+; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
+  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
+  ret i16 %res
+}
+
+define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_b_128
+; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
+  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
+  ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8>, <16 x i8>, i16)
+
+define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_pcmpeq_w_128
+; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
+  ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_w_128
+; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
+  ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16>, <8 x i16>, i8)
+
+define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_pcmpgt_b_128
+; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 ##
+  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
+  ret i16 %res
+}
+
+define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_b_128
+; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ##
+  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
+  ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8>, <16 x i8>, i16)
+
+define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_pcmpgt_w_128
+; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
+  ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_w_128
+; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
+  ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8)
+
+define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK_LABEL: test_cmp_b_128
+; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltb %xmm1, %xmm0, %k0 ##
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleb %xmm1, %xmm0, %k0 ##
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 ##
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 ##
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 ##
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 ##
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordb %xmm1, %xmm0, %k0 ##
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
+; CHECK_LABEL: test_mask_cmp_b_128
+; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltb %xmm1, %xmm0, %k0 {%k1} ##
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleb %xmm1, %xmm0, %k0 {%k1} ##
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 {%k1} ##
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} ##
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} ##
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 {%k1} ##
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordb %xmm1, %xmm0, %k0 {%k1} ##
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
+
+define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK_LABEL: test_ucmp_b_128
+; CHECK: vpcmpequb %xmm1, %xmm0, %k0 ##
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltub %xmm1, %xmm0, %k0 ##
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleub %xmm1, %xmm0, %k0 ##
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 ##
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 ##
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 ##
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 ##
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordub %xmm1, %xmm0, %k0 ##
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
+; CHECK_LABEL: test_mask_ucmp_b_128
+; CHECK: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ##
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltub %xmm1, %xmm0, %k0 {%k1} ##
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleub %xmm1, %xmm0, %k0 {%k1} ##
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 {%k1} ##
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 {%k1} ##
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} ##
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} ##
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordub %xmm1, %xmm0, %k0 {%k1} ##
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
+
+define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK_LABEL: test_cmp_w_128
+; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltw %xmm1, %xmm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmplew %xmm1, %xmm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordw %xmm1, %xmm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
+; CHECK_LABEL: test_mask_cmp_w_128
+; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltw %xmm1, %xmm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmplew %xmm1, %xmm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordw %xmm1, %xmm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK_LABEL: test_ucmp_w_128
+; CHECK: vpcmpequw %xmm1, %xmm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduw %xmm1, %xmm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
+; CHECK_LABEL: test_mask_ucmp_w_128
+; CHECK: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduw %xmm1, %xmm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx512bwvl-mov.ll b/test/CodeGen/X86/avx512bwvl-mov.ll
new file mode 100644
index 0000000..835844f
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-mov.ll
@@ -0,0 +1,162 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+
+; CHECK-LABEL: test_256_1
+; CHECK: vmovdqu8 {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <32 x i8> @test_256_1(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <32 x i8>*
+  %res = load <32 x i8>* %vaddr, align 1
+  ret <32 x i8>%res
+}
+
+; CHECK-LABEL: test_256_2
+; CHECK: vmovdqu8{{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_2(i8 * %addr, <32 x i8> %data) {
+  %vaddr = bitcast i8* %addr to <32 x i8>*
+  store <32 x i8>%data, <32 x i8>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_256_3
+; CHECK: vmovdqu8{{.*{%k[1-7]} }}## encoding: [0x62
+; CHECK: ret
+define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) {
+  %mask = icmp ne <32 x i8> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <32 x i8>*
+  %r = load <32 x i8>* %vaddr, align 1
+  %res = select <32 x i1> %mask, <32 x i8> %r, <32 x i8> %old
+  ret <32 x i8>%res
+}
+
+; CHECK-LABEL: test_256_4
+; CHECK: vmovdqu8{{.*{%k[1-7]} {z} }}## encoding: [0x62
+; CHECK: ret
+define <32 x i8> @test_256_4(i8 * %addr, <32 x i8> %mask1) {
+  %mask = icmp ne <32 x i8> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <32 x i8>*
+  %r = load <32 x i8>* %vaddr, align 1
+  %res = select <32 x i1> %mask, <32 x i8> %r, <32 x i8> zeroinitializer
+  ret <32 x i8>%res
+}
+
+; CHECK-LABEL: test_256_5
+; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
+; CHECK: ret
+define <16 x i16> @test_256_5(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <16 x i16>*
+  %res = load <16 x i16>* %vaddr, align 1
+  ret <16 x i16>%res
+}
+
+; CHECK-LABEL: test_256_6
+; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_6(i8 * %addr, <16 x i16> %data) {
+  %vaddr = bitcast i8* %addr to <16 x i16>*
+  store <16 x i16>%data, <16 x i16>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_256_7
+; CHECK: vmovdqu16{{.*{%k[1-7]} }}## encoding: [0x62
+; CHECK: ret
+define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) {
+  %mask = icmp ne <16 x i16> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i16>*
+  %r = load <16 x i16>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x i16> %r, <16 x i16> %old
+  ret <16 x i16>%res
+}
+
+; CHECK-LABEL: test_256_8
+; CHECK: vmovdqu16{{.*{%k[1-7]} {z} }}## encoding: [0x62
+; CHECK: ret
+define <16 x i16> @test_256_8(i8 * %addr, <16 x i16> %mask1) {
+  %mask = icmp ne <16 x i16> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i16>*
+  %r = load <16 x i16>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x i16> %r, <16 x i16> zeroinitializer
+  ret <16 x i16>%res
+}
+
+; CHECK-LABEL: test_128_1
+; CHECK: vmovdqu8 {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <16 x i8> @test_128_1(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <16 x i8>*
+  %res = load <16 x i8>* %vaddr, align 1
+  ret <16 x i8>%res
+}
+
+; CHECK-LABEL: test_128_2
+; CHECK: vmovdqu8{{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_2(i8 * %addr, <16 x i8> %data) {
+  %vaddr = bitcast i8* %addr to <16 x i8>*
+  store <16 x i8>%data, <16 x i8>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_128_3
+; CHECK: vmovdqu8{{.*{%k[1-7]} }}## encoding: [0x62
+; CHECK: ret
+define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) {
+  %mask = icmp ne <16 x i8> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i8>*
+  %r = load <16 x i8>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x i8> %r, <16 x i8> %old
+  ret <16 x i8>%res
+}
+
+; CHECK-LABEL: test_128_4
+; CHECK: vmovdqu8{{.*{%k[1-7]} {z} }}## encoding: [0x62
+; CHECK: ret
+define <16 x i8> @test_128_4(i8 * %addr, <16 x i8> %mask1) {
+  %mask = icmp ne <16 x i8> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i8>*
+  %r = load <16 x i8>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x i8> %r, <16 x i8> zeroinitializer
+  ret <16 x i8>%res
+}
+
+; CHECK-LABEL: test_128_5
+; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
+; CHECK: ret
+define <8 x i16> @test_128_5(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x i16>*
+  %res = load <8 x i16>* %vaddr, align 1
+  ret <8 x i16>%res
+}
+
+; CHECK-LABEL: test_128_6
+; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_6(i8 * %addr, <8 x i16> %data) {
+  %vaddr = bitcast i8* %addr to <8 x i16>*
+  store <8 x i16>%data, <8 x i16>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_128_7
+; CHECK: vmovdqu16{{.*{%k[1-7]} }}## encoding: [0x62
+; CHECK: ret
+define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) {
+  %mask = icmp ne <8 x i16> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i16>*
+  %r = load <8 x i16>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x i16> %r, <8 x i16> %old
+  ret <8 x i16>%res
+}
+
+; CHECK-LABEL: test_128_8
+; CHECK: vmovdqu16{{.*{%k[1-7]} {z} }}## encoding: [0x62
+; CHECK: ret
+define <8 x i16> @test_128_8(i8 * %addr, <8 x i16> %mask1) {
+  %mask = icmp ne <8 x i16> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i16>*
+  %r = load <8 x i16>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x i16> %r, <8 x i16> zeroinitializer
+  ret <8 x i16>%res
+}
+
diff --git a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
new file mode 100644
index 0000000..2d13a16
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
@@ -0,0 +1,269 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
+; CHECK-LABEL: test256_1
+; CHECK: vpcmpeqb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <32 x i8> @test256_1(<32 x i8> %x, <32 x i8> %y) nounwind {
+  %mask = icmp eq <32 x i8> %x, %y
+  %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %y
+  ret <32 x i8> %max
+}
+
+; CHECK-LABEL: test256_2
+; CHECK: vpcmpgtb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <32 x i8> @test256_2(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind {
+  %mask = icmp sgt <32 x i8> %x, %y
+  %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
+  ret <32 x i8> %max
+}
+
+; CHECK-LABEL: @test256_3
+; CHECK: vpcmplew {{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_3(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1) nounwind {
+  %mask = icmp sge <16 x i16> %x, %y
+  %max = select <16 x i1> %mask, <16 x i16> %x1, <16 x i16> %y
+  ret <16 x i16> %max
+}
+
+; CHECK-LABEL: test256_4
+; CHECK: vpcmpnleub {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <32 x i8> @test256_4(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind {
+  %mask = icmp ugt <32 x i8> %x, %y
+  %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
+  ret <32 x i8> %max
+}
+
+; CHECK-LABEL: test256_5
+; CHECK: vpcmpeqw  (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_5(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %yp) nounwind {
+  %y = load <16 x i16>* %yp, align 4
+  %mask = icmp eq <16 x i16> %x, %y
+  %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+  ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_6
+; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_6(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+  %y = load <16 x i16>* %y.ptr, align 4
+  %mask = icmp sgt <16 x i16> %x, %y
+  %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+  ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_7
+; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_7(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+  %y = load <16 x i16>* %y.ptr, align 4
+  %mask = icmp sle <16 x i16> %x, %y
+  %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+  ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_8
+; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_8(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+  %y = load <16 x i16>* %y.ptr, align 4
+  %mask = icmp ule <16 x i16> %x, %y
+  %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+  ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_9
+; CHECK: vpcmpeqw %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_9(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1, <16 x i16> %y1) nounwind {
+  %mask1 = icmp eq <16 x i16> %x1, %y1
+  %mask0 = icmp eq <16 x i16> %x, %y
+  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+  %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %y
+  ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_10
+; CHECK: vpcmpleb %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <32 x i8> @test256_10(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1, <32 x i8> %y1) nounwind {
+  %mask1 = icmp sge <32 x i8> %x1, %y1
+  %mask0 = icmp sle <32 x i8> %x, %y
+  %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer
+  %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
+  ret <32 x i8> %max
+}
+
+; CHECK-LABEL: @test256_11
+; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <32 x i8> @test256_11(<32 x i8> %x, <32 x i8>* %y.ptr, <32 x i8> %x1, <32 x i8> %y1) nounwind {
+  %mask1 = icmp sgt <32 x i8> %x1, %y1
+  %y = load <32 x i8>* %y.ptr, align 4
+  %mask0 = icmp sgt <32 x i8> %x, %y
+  %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer
+  %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
+  ret <32 x i8> %max
+}
+
+; CHECK-LABEL: @test256_12
+; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_12(<16 x i16> %x, <16 x i16>* %y.ptr, <16 x i16> %x1, <16 x i16> %y1) nounwind {
+  %mask1 = icmp sge <16 x i16> %x1, %y1
+  %y = load <16 x i16>* %y.ptr, align 4
+  %mask0 = icmp ule <16 x i16> %x, %y
+  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+  %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+  ret <16 x i16> %max
+}
+
+; CHECK-LABEL: test128_1
+; CHECK: vpcmpeqb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <16 x i8> @test128_1(<16 x i8> %x, <16 x i8> %y) nounwind {
+  %mask = icmp eq <16 x i8> %x, %y
+  %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %y
+  ret <16 x i8> %max
+}
+
+; CHECK-LABEL: test128_2
+; CHECK: vpcmpgtb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <16 x i8> @test128_2(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind {
+  %mask = icmp sgt <16 x i8> %x, %y
+  %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
+  ret <16 x i8> %max
+}
+
+; CHECK-LABEL: @test128_3
+; CHECK: vpcmplew {{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_3(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1) nounwind {
+  %mask = icmp sge <8 x i16> %x, %y
+  %max = select <8 x i1> %mask, <8 x i16> %x1, <8 x i16> %y
+  ret <8 x i16> %max
+}
+
+; CHECK-LABEL: test128_4
+; CHECK: vpcmpnleub {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <16 x i8> @test128_4(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind {
+  %mask = icmp ugt <16 x i8> %x, %y
+  %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
+  ret <16 x i8> %max
+}
+
+; CHECK-LABEL: test128_5
+; CHECK: vpcmpeqw  (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_5(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %yp) nounwind {
+  %y = load <8 x i16>* %yp, align 4
+  %mask = icmp eq <8 x i16> %x, %y
+  %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+  ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_6
+; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_6(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+  %y = load <8 x i16>* %y.ptr, align 4
+  %mask = icmp sgt <8 x i16> %x, %y
+  %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+  ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_7
+; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_7(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+  %y = load <8 x i16>* %y.ptr, align 4
+  %mask = icmp sle <8 x i16> %x, %y
+  %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+  ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_8
+; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_8(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+  %y = load <8 x i16>* %y.ptr, align 4
+  %mask = icmp ule <8 x i16> %x, %y
+  %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+  ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_9
+; CHECK: vpcmpeqw %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_9(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1, <8 x i16> %y1) nounwind {
+  %mask1 = icmp eq <8 x i16> %x1, %y1
+  %mask0 = icmp eq <8 x i16> %x, %y
+  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+  %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_10
+; CHECK: vpcmpleb %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <16 x i8> @test128_10(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1, <16 x i8> %y1) nounwind {
+  %mask1 = icmp sge <16 x i8> %x1, %y1
+  %mask0 = icmp sle <16 x i8> %x, %y
+  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+  %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
+  ret <16 x i8> %max
+}
+
+; CHECK-LABEL: @test128_11
+; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <16 x i8> @test128_11(<16 x i8> %x, <16 x i8>* %y.ptr, <16 x i8> %x1, <16 x i8> %y1) nounwind {
+  %mask1 = icmp sgt <16 x i8> %x1, %y1
+  %y = load <16 x i8>* %y.ptr, align 4
+  %mask0 = icmp sgt <16 x i8> %x, %y
+  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+  %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
+  ret <16 x i8> %max
+}
+
+; CHECK-LABEL: @test128_12
+; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_12(<8 x i16> %x, <8 x i16>* %y.ptr, <8 x i16> %x1, <8 x i16> %y1) nounwind {
+  %mask1 = icmp sge <8 x i16> %x1, %y1
+  %y = load <8 x i16>* %y.ptr, align 4
+  %mask0 = icmp ule <8 x i16> %x, %y
+  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+  %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+  ret <8 x i16> %max
+}
diff --git a/test/CodeGen/X86/avx512dq-mask-op.ll b/test/CodeGen/X86/avx512dq-mask-op.ll
new file mode 100644
index 0000000..32a2633
--- /dev/null
+++ b/test/CodeGen/X86/avx512dq-mask-op.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
+define i8 @mask8(i8 %x) {
+  %m0 = bitcast i8 %x to <8 x i1>
+  %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <8 x i1> %m1 to i8
+  ret i8 %ret
+; CHECK: mask8
+; CHECK: knotb
+; CHECK: ret
+}
+
+define void @mask8_mem(i8* %ptr) {
+  %x = load i8* %ptr, align 4
+  %m0 = bitcast i8 %x to <8 x i1>
+  %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <8 x i1> %m1 to i8
+  store i8 %ret, i8* %ptr, align 4
+  ret void
+; CHECK-LABEL: mask8_mem
+; CHECK: kmovb ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
+; CHECK-NEXT: knotb
+; CHECK-NEXT: kmovb %k{{[0-7]}}, ([[ARG1]])
+; CHECK: ret
+}
+
+define i8 @mand8(i8 %x, i8 %y) {
+  %ma = bitcast i8 %x to <8 x i1>
+  %mb = bitcast i8 %y to <8 x i1>
+  %mc = and <8 x i1> %ma, %mb
+  %md = xor <8 x i1> %ma, %mb
+  %me = or <8 x i1> %mc, %md
+  %ret = bitcast <8 x i1> %me to i8
+; CHECK: kandb
+; CHECK: kxorb
+; CHECK: korb
+  ret i8 %ret
+}
diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll
new file mode 100644
index 0000000..0000ece
--- /dev/null
+++ b/test/CodeGen/X86/avx512er-intrinsics.ll
@@ -0,0 +1,79 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=knl --show-mc-encoding| FileCheck %s
+
+define <16 x float> @test_rsqrt28_ps(<16 x float> %a0) {
+  ; CHECK: vrsqrt28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test1_rsqrt28_ps(<16 x float> %a0, <16 x float> %a1) {
+  ; CHECK: kmovw
+  ; CHECK: vrsqrt28ps %zmm0, %zmm1 {%k1}{sae} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8]
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> %a1, i16 6, i32 8)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test2_rsqrt28_ps(<16 x float> %a0) {
+  ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 4)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test3_rsqrt28_ps(<16 x float> %a0) {
+  ; CHECK: kmovw
+  ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 6, i32 4)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) {
+  ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 8) 
+  ret <16 x float> %res
+}
+
+
+declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
+  ; CHECK: vrcp28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
+  ; CHECK: vrcp28pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
+  %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) 
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <16 x float> @test_exp2_ps_512(<16 x float> %a0) {
+  ; CHECK: vexp2ps %zmm0, %zmm0 {sae}      # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <8 x double> @test_exp2_pd_512(<8 x double> %a0) {
+  ; CHECK: vexp2pd %zmm0, %zmm0 {sae}      # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0]
+  %res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
+  ; CHECK: vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
+  %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
+define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
+  ; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
+  %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
new file mode 100644
index 0000000..fa19084
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -0,0 +1,613 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+
+; 256-bit
+
+define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_pcmpeq_d_256
+; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
+  ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_d_256
+; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
+  ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32>, <8 x i32>, i8)
+
+define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_pcmpeq_q_256
+; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
+  ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_q_256
+; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
+  ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64>, <4 x i64>, i8)
+
+define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_pcmpgt_d_256
+; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
+  ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_d_256
+; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
+  ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32>, <8 x i32>, i8)
+
+define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_pcmpgt_q_256
+; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
+  ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_q_256
+; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
+  ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8)
+
+define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK_LABEL: test_cmp_d_256
+; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltd %ymm1, %ymm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpled %ymm1, %ymm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnled %ymm1, %ymm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordd %ymm1, %ymm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
+; CHECK_LABEL: test_mask_cmp_d_256
+; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltd %ymm1, %ymm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpled %ymm1, %ymm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnled %ymm1, %ymm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordd %ymm1, %ymm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK_LABEL: test_ucmp_d_256
+; CHECK: vpcmpequd %ymm1, %ymm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltud %ymm1, %ymm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleud %ymm1, %ymm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordud %ymm1, %ymm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
+; CHECK_LABEL: test_mask_ucmp_d_256
+; CHECK: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltud %ymm1, %ymm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleud %ymm1, %ymm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordud %ymm1, %ymm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK_LABEL: test_cmp_q_256
+; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %ymm1, %ymm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %ymm1, %ymm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %ymm1, %ymm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
+; CHECK_LABEL: test_mask_cmp_q_256
+; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %ymm1, %ymm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %ymm1, %ymm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK_LABEL: test_ucmp_q_256
+; CHECK: vpcmpequq %ymm1, %ymm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %ymm1, %ymm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
+; CHECK_LABEL: test_mask_ucmp_q_256
+; CHECK: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %ymm1, %ymm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
+
+; 128-bit
+
+define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_pcmpeq_d_128
+; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
+  ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_d_128
+; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
+  ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32>, <4 x i32>, i8)
+
+define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_pcmpeq_q_128
+; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
+  ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_q_128
+; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
+  ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64>, <2 x i64>, i8)
+
+define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_pcmpgt_d_128
+; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
+  ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_d_128
+; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
+  ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32>, <4 x i32>, i8)
+
+define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_pcmpgt_q_128
+; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
+  ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_q_128
+; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
+  ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8)
+
+define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK_LABEL: test_cmp_d_128
+; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltd %xmm1, %xmm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpled %xmm1, %xmm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnled %xmm1, %xmm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordd %xmm1, %xmm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
+; CHECK_LABEL: test_mask_cmp_d_128
+; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltd %xmm1, %xmm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpled %xmm1, %xmm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnled %xmm1, %xmm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordd %xmm1, %xmm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK_LABEL: test_ucmp_d_128
+; CHECK: vpcmpequd %xmm1, %xmm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltud %xmm1, %xmm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleud %xmm1, %xmm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordud %xmm1, %xmm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
+; CHECK_LABEL: test_mask_ucmp_d_128
+; CHECK: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltud %xmm1, %xmm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleud %xmm1, %xmm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordud %xmm1, %xmm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK_LABEL: test_cmp_q_128
+; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %xmm1, %xmm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %xmm1, %xmm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %xmm1, %xmm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
+; CHECK_LABEL: test_mask_cmp_q_128
+; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %xmm1, %xmm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %xmm1, %xmm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %xmm1, %xmm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK_LABEL: test_ucmp_q_128
+; CHECK: vpcmpequq %xmm1, %xmm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %xmm1, %xmm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
+; CHECK_LABEL: test_mask_ucmp_q_128
+; CHECK: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %xmm1, %xmm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx512vl-mov.ll b/test/CodeGen/X86/avx512vl-mov.ll
new file mode 100644
index 0000000..3224656
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-mov.ll
@@ -0,0 +1,642 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+
+; CHECK-LABEL: test_256_1
+; CHECK: vmovdqu32
+; CHECK: ret
+define <8 x i32> @test_256_1(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  %res = load <8 x i32>* %vaddr, align 1
+  ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_2
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test_256_2(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  %res = load <8 x i32>* %vaddr, align 32
+  ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_3
+; CHECK: vmovdqa64
+; CHECK: ret
+define void @test_256_3(i8 * %addr, <4 x i64> %data) {
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  store <4 x i64>%data, <4 x i64>* %vaddr, align 32
+  ret void
+}
+
+; CHECK-LABEL: test_256_4
+; CHECK: vmovdqu32
+; CHECK: ret
+define void @test_256_4(i8 * %addr, <8 x i32> %data) {
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  store <8 x i32>%data, <8 x i32>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_256_5
+; CHECK: vmovdqa32
+; CHECK: ret
+define void @test_256_5(i8 * %addr, <8 x i32> %data) {
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  store <8 x i32>%data, <8 x i32>* %vaddr, align 32
+  ret void
+}
+
+; CHECK-LABEL: test_256_6
+; CHECK: vmovdqa64
+; CHECK: ret
+define  <4 x i64> @test_256_6(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  %res = load <4 x i64>* %vaddr, align 32
+  ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_7
+; CHECK: vmovdqu64
+; CHECK: ret
+define void @test_256_7(i8 * %addr, <4 x i64> %data) {
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  store <4 x i64>%data, <4 x i64>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_256_8
+; CHECK: vmovdqu64
+; CHECK: ret
+define <4 x i64> @test_256_8(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  %res = load <4 x i64>* %vaddr, align 1
+  ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_9
+; CHECK: vmovapd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_9(i8 * %addr, <4 x double> %data) {
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  store <4 x double>%data, <4 x double>* %vaddr, align 32
+  ret void
+}
+
+; CHECK-LABEL: test_256_10
+; CHECK: vmovapd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <4 x double> @test_256_10(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  %res = load <4 x double>* %vaddr, align 32
+  ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_256_11
+; CHECK: vmovaps {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_11(i8 * %addr, <8 x float> %data) {
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  store <8 x float>%data, <8 x float>* %vaddr, align 32
+  ret void
+}
+
+; CHECK-LABEL: test_256_12
+; CHECK: vmovaps {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <8 x float> @test_256_12(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  %res = load <8 x float>* %vaddr, align 32
+  ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_13
+; CHECK: vmovupd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_13(i8 * %addr, <4 x double> %data) {
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  store <4 x double>%data, <4 x double>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_256_14
+; CHECK: vmovupd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <4 x double> @test_256_14(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  %res = load <4 x double>* %vaddr, align 1
+  ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_256_15
+; CHECK: vmovups {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_15(i8 * %addr, <8 x float> %data) {
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  store <8 x float>%data, <8 x float>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_256_16
+; CHECK: vmovups {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <8 x float> @test_256_16(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  %res = load <8 x float>* %vaddr, align 1
+  ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_17
+; CHECK: vmovdqa32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  %r = load <8 x i32>* %vaddr, align 32
+  %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> %old
+  ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_18
+; CHECK: vmovdqu32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  %r = load <8 x i32>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> %old
+  ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_19
+; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  %r = load <8 x i32>* %vaddr, align 32
+  %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> zeroinitializer
+  ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_20
+; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  %r = load <8 x i32>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> zeroinitializer
+  ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_21
+; CHECK: vmovdqa64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  %r = load <4 x i64>* %vaddr, align 32
+  %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> %old
+  ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_22
+; CHECK: vmovdqu64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  %r = load <4 x i64>* %vaddr, align 1
+  %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> %old
+  ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_23
+; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  %r = load <4 x i64>* %vaddr, align 32
+  %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> zeroinitializer
+  ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_24
+; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x i64> @test_256_24(i8 * %addr, <4 x i64> %mask1) {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  %r = load <4 x i64>* %vaddr, align 1
+  %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> zeroinitializer
+  ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_25
+; CHECK: vmovaps{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x float> @test_256_25(i8 * %addr, <8 x float> %old, <8 x float> %mask1) {
+  %mask = fcmp one <8 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  %r = load <8 x float>* %vaddr, align 32
+  %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> %old
+  ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_26
+; CHECK: vmovups{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x float> @test_256_26(i8 * %addr, <8 x float> %old, <8 x float> %mask1) {
+  %mask = fcmp one <8 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  %r = load <8 x float>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> %old
+  ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_27
+; CHECK: vmovaps{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x float> @test_256_27(i8 * %addr, <8 x float> %mask1) {
+  %mask = fcmp one <8 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  %r = load <8 x float>* %vaddr, align 32
+  %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> zeroinitializer
+  ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_28
+; CHECK: vmovups{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x float> @test_256_28(i8 * %addr, <8 x float> %mask1) {
+  %mask = fcmp one <8 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  %r = load <8 x float>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> zeroinitializer
+  ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_29
+; CHECK: vmovapd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  %r = load <4 x double>* %vaddr, align 32
+  %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> %old
+  ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_256_30
+; CHECK: vmovupd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  %r = load <4 x double>* %vaddr, align 1
+  %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> %old
+  ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_256_31
+; CHECK: vmovapd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x double> @test_256_31(i8 * %addr, <4 x i64> %mask1) {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  %r = load <4 x double>* %vaddr, align 32
+  %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> zeroinitializer
+  ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_256_32
+; CHECK: vmovupd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x double> @test_256_32(i8 * %addr, <4 x i64> %mask1) {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  %r = load <4 x double>* %vaddr, align 1
+  %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> zeroinitializer
+  ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_128_1
+; CHECK: vmovdqu32
+; CHECK: ret
+define <4 x i32> @test_128_1(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x i32>*
+  %res = load <4 x i32>* %vaddr, align 1
+  ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_2
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test_128_2(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x i32>*
+  %res = load <4 x i32>* %vaddr, align 16
+  ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_3
+; CHECK: vmovdqa64
+; CHECK: ret
+define void @test_128_3(i8 * %addr, <2 x i64> %data) {
+  %vaddr = bitcast i8* %addr to <2 x i64>*
+  store <2 x i64>%data, <2 x i64>* %vaddr, align 16
+  ret void
+}
+
+; CHECK-LABEL: test_128_4
+; CHECK: vmovdqu32
+; CHECK: ret
+define void @test_128_4(i8 * %addr, <4 x i32> %data) {
+  %vaddr = bitcast i8* %addr to <4 x i32>*
+  store <4 x i32>%data, <4 x i32>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_128_5
+; CHECK: vmovdqa32
+; CHECK: ret
+define void @test_128_5(i8 * %addr, <4 x i32> %data) {
+  %vaddr = bitcast i8* %addr to <4 x i32>*
+  store <4 x i32>%data, <4 x i32>* %vaddr, align 16
+  ret void
+}
+
+; CHECK-LABEL: test_128_6
+; CHECK: vmovdqa64
+; CHECK: ret
+define  <2 x i64> @test_128_6(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <2 x i64>*
+  %res = load <2 x i64>* %vaddr, align 16
+  ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_7
+; CHECK: vmovdqu64
+; CHECK: ret
+define void @test_128_7(i8 * %addr, <2 x i64> %data) {
+  %vaddr = bitcast i8* %addr to <2 x i64>*
+  store <2 x i64>%data, <2 x i64>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_128_8
+; CHECK: vmovdqu64
+; CHECK: ret
+define <2 x i64> @test_128_8(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <2 x i64>*
+  %res = load <2 x i64>* %vaddr, align 1
+  ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_9
+; CHECK: vmovapd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_9(i8 * %addr, <2 x double> %data) {
+  %vaddr = bitcast i8* %addr to <2 x double>*
+  store <2 x double>%data, <2 x double>* %vaddr, align 16
+  ret void
+}
+
+; CHECK-LABEL: test_128_10
+; CHECK: vmovapd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <2 x double> @test_128_10(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <2 x double>*
+  %res = load <2 x double>* %vaddr, align 16
+  ret <2 x double>%res
+}
+
+; CHECK-LABEL: test_128_11
+; CHECK: vmovaps {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_11(i8 * %addr, <4 x float> %data) {
+  %vaddr = bitcast i8* %addr to <4 x float>*
+  store <4 x float>%data, <4 x float>* %vaddr, align 16
+  ret void
+}
+
+; CHECK-LABEL: test_128_12
+; CHECK: vmovaps {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <4 x float> @test_128_12(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x float>*
+  %res = load <4 x float>* %vaddr, align 16
+  ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_13
+; CHECK: vmovupd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_13(i8 * %addr, <2 x double> %data) {
+  %vaddr = bitcast i8* %addr to <2 x double>*
+  store <2 x double>%data, <2 x double>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_128_14
+; CHECK: vmovupd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <2 x double> @test_128_14(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <2 x double>*
+  %res = load <2 x double>* %vaddr, align 1
+  ret <2 x double>%res
+}
+
+; CHECK-LABEL: test_128_15
+; CHECK: vmovups {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_15(i8 * %addr, <4 x float> %data) {
+  %vaddr = bitcast i8* %addr to <4 x float>*
+  store <4 x float>%data, <4 x float>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_128_16
+; CHECK: vmovups {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <4 x float> @test_128_16(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x float>*
+  %res = load <4 x float>* %vaddr, align 1
+  ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_17
+; CHECK: vmovdqa32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x i32>*
+  %r = load <4 x i32>* %vaddr, align 16
+  %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> %old
+  ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_18
+; CHECK: vmovdqu32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x i32>*
+  %r = load <4 x i32>* %vaddr, align 1
+  %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> %old
+  ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_19
+; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x i32>*
+  %r = load <4 x i32>* %vaddr, align 16
+  %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> zeroinitializer
+  ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_20
+; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x i32>*
+  %r = load <4 x i32>* %vaddr, align 1
+  %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> zeroinitializer
+  ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_21
+; CHECK: vmovdqa64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <2 x i64>*
+  %r = load <2 x i64>* %vaddr, align 16
+  %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> %old
+  ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_22
+; CHECK: vmovdqu64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <2 x i64>*
+  %r = load <2 x i64>* %vaddr, align 1
+  %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> %old
+  ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_23
+; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <2 x i64>*
+  %r = load <2 x i64>* %vaddr, align 16
+  %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> zeroinitializer
+  ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_24
+; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <2 x i64>*
+  %r = load <2 x i64>* %vaddr, align 1
+  %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> zeroinitializer
+  ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_25
+; CHECK: vmovaps{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x float>*
+  %r = load <4 x float>* %vaddr, align 16
+  %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> %old
+  ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_26
+; CHECK: vmovups{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x float>*
+  %r = load <4 x float>* %vaddr, align 1
+  %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> %old
+  ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_27
+; CHECK: vmovaps{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x float> @test_128_27(i8 * %addr, <4 x i32> %mask1) {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x float>*
+  %r = load <4 x float>* %vaddr, align 16
+  %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> zeroinitializer
+  ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_28
+; CHECK: vmovups{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x float> @test_128_28(i8 * %addr, <4 x i32> %mask1) {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x float>*
+  %r = load <4 x float>* %vaddr, align 1
+  %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> zeroinitializer
+  ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_29
+; CHECK: vmovapd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <2 x double>*
+  %r = load <2 x double>* %vaddr, align 16
+  %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> %old
+  ret <2 x double>%res
+}
+
+; CHECK-LABEL: test_128_30
+; CHECK: vmovupd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <2 x double>*
+  %r = load <2 x double>* %vaddr, align 1
+  %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> %old
+  ret <2 x double>%res
+}
+
+; CHECK-LABEL: test_128_31
+; CHECK: vmovapd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <2 x double> @test_128_31(i8 * %addr, <2 x i64> %mask1) {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <2 x double>*
+  %r = load <2 x double>* %vaddr, align 16
+  %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> zeroinitializer
+  ret <2 x double>%res
+}
+
+; CHECK-LABEL: test_128_32
+; CHECK: vmovupd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <2 x double> @test_128_32(i8 * %addr, <2 x i64> %mask1) {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <2 x double>*
+  %r = load <2 x double>* %vaddr, align 1
+  %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> zeroinitializer
+  ret <2 x double>%res
+}
+
diff --git a/test/CodeGen/X86/avx512vl-nontemporal.ll b/test/CodeGen/X86/avx512vl-nontemporal.ll
new file mode 100644
index 0000000..2ad9768
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-nontemporal.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s  -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s
+
+define void @f256(<8 x float> %A, <8 x float> %AA, i8* %B, <4 x double> %C, <4 x double> %CC, i32 %D, <4 x i64> %E, <4 x i64> %EE) {
+; CHECK: vmovntps %ymm{{.*}} ## encoding: [0x62
+  %cast = bitcast i8* %B to <8 x float>*
+  %A2 = fadd <8 x float> %A, %AA
+  store <8 x float> %A2, <8 x float>* %cast, align 64, !nontemporal !0
+; CHECK: vmovntdq %ymm{{.*}} ## encoding: [0x62
+  %cast1 = bitcast i8* %B to <4 x i64>*
+  %E2 = add <4 x i64> %E, %EE
+  store <4 x i64> %E2, <4 x i64>* %cast1, align 64, !nontemporal !0
+; CHECK: vmovntpd %ymm{{.*}} ## encoding: [0x62
+  %cast2 = bitcast i8* %B to <4 x double>*
+  %C2 = fadd <4 x double> %C, %CC
+  store <4 x double> %C2, <4 x double>* %cast2, align 64, !nontemporal !0
+  ret void
+}
+
+define void @f128(<4 x float> %A, <4 x float> %AA, i8* %B, <2 x double> %C, <2 x double> %CC, i32 %D, <2 x i64> %E, <2 x i64> %EE) {
+; CHECK: vmovntps %xmm{{.*}} ## encoding: [0x62
+  %cast = bitcast i8* %B to <4 x float>*
+  %A2 = fadd <4 x float> %A, %AA
+  store <4 x float> %A2, <4 x float>* %cast, align 64, !nontemporal !0
+; CHECK: vmovntdq %xmm{{.*}} ## encoding: [0x62
+  %cast1 = bitcast i8* %B to <2 x i64>*
+  %E2 = add <2 x i64> %E, %EE
+  store <2 x i64> %E2, <2 x i64>* %cast1, align 64, !nontemporal !0
+; CHECK: vmovntpd %xmm{{.*}} ## encoding: [0x62
+  %cast2 = bitcast i8* %B to <2 x double>*
+  %C2 = fadd <2 x double> %C, %CC
+  store <2 x double> %C2, <2 x double>* %cast2, align 64, !nontemporal !0
+  ret void
+}
+!0 = metadata !{i32 1}
diff --git a/test/CodeGen/X86/avx512vl-vec-cmp.ll b/test/CodeGen/X86/avx512vl-vec-cmp.ll
new file mode 100644
index 0000000..9c64c03
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-vec-cmp.ll
@@ -0,0 +1,381 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
+; CHECK-LABEL: test256_1
+; CHECK: vpcmpeqq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind {
+  %mask = icmp eq <4 x i64> %x, %y
+  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
+  ret <4 x i64> %max
+}
+
+; CHECK-LABEL: test256_2
+; CHECK: vpcmpgtq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y) nounwind {
+  %mask = icmp sgt <4 x i64> %x, %y
+  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
+  ret <4 x i64> %max
+}
+
+; CHECK-LABEL: @test256_3
+; CHECK: vpcmpled {{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind {
+  %mask = icmp sge <8 x i32> %x, %y
+  %max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: test256_4
+; CHECK: vpcmpnleuq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y) nounwind {
+  %mask = icmp ugt <4 x i64> %x, %y
+  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
+  ret <4 x i64> %max
+}
+
+; CHECK-LABEL: test256_5
+; CHECK: vpcmpeqd  (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
+  %y = load <8 x i32>* %yp, align 4
+  %mask = icmp eq <8 x i32> %x, %y
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: @test256_6
+; CHECK: vpcmpgtd (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
+  %y = load <8 x i32>* %y.ptr, align 4
+  %mask = icmp sgt <8 x i32> %x, %y
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: @test256_7
+; CHECK: vpcmpled (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
+  %y = load <8 x i32>* %y.ptr, align 4
+  %mask = icmp sle <8 x i32> %x, %y
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: @test256_8
+; CHECK: vpcmpleud (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
+  %y = load <8 x i32>* %y.ptr, align 4
+  %mask = icmp ule <8 x i32> %x, %y
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: @test256_9
+; CHECK: vpcmpeqd %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> %y1) nounwind {
+  %mask1 = icmp eq <8 x i32> %x1, %y1
+  %mask0 = icmp eq <8 x i32> %x, %y
+  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: @test256_10
+; CHECK: vpcmpleq %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) nounwind {
+  %mask1 = icmp sge <4 x i64> %x1, %y1
+  %mask0 = icmp sle <4 x i64> %x, %y
+  %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1
+  ret <4 x i64> %max
+}
+
+; CHECK-LABEL: @test256_11
+; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind {
+  %mask1 = icmp sgt <4 x i64> %x1, %y1
+  %y = load <4 x i64>* %y.ptr, align 4
+  %mask0 = icmp sgt <4 x i64> %x, %y
+  %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1
+  ret <4 x i64> %max
+}
+
+; CHECK-LABEL: @test256_12
+; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind {
+  %mask1 = icmp sge <8 x i32> %x1, %y1
+  %y = load <8 x i32>* %y.ptr, align 4
+  %mask0 = icmp ule <8 x i32> %x, %y
+  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: test256_13
+; CHECK: vpcmpeqq  (%rdi){1to4}, %ymm
+; CHECK: vmovdqa64
+; CHECK: ret
+define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind {
+  %yb = load i64* %yb.ptr, align 4
+  %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
+  %y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer
+  %mask = icmp eq <4 x i64> %x, %y
+  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1
+  ret <4 x i64> %max
+}
+
+; CHECK-LABEL: test256_14
+; CHECK: vpcmpled  (%rdi){1to8}, %ymm
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind {
+  %yb = load i32* %yb.ptr, align 4
+  %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
+  %y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer
+  %mask = icmp sle <8 x i32> %x, %y
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: test256_15
+; CHECK: vpcmpgtd  (%rdi){1to8}, %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind {
+  %mask1 = icmp sge <8 x i32> %x1, %y1
+  %yb = load i32* %yb.ptr, align 4
+  %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
+  %y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer
+  %mask0 = icmp sgt <8 x i32> %x, %y
+  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: test256_16
+; CHECK: vpcmpgtq  (%rdi){1to4}, %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind {
+  %mask1 = icmp sge <4 x i64> %x1, %y1
+  %yb = load i64* %yb.ptr, align 4
+  %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
+  %y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer
+  %mask0 = icmp sgt <4 x i64> %x, %y
+  %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1
+  ret <4 x i64> %max
+}
+
+; CHECK-LABEL: test128_1
+; CHECK: vpcmpeqq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind {
+  %mask = icmp eq <2 x i64> %x, %y
+  %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
+  ret <2 x i64> %max
+}
+
+; CHECK-LABEL: test128_2
+; CHECK: vpcmpgtq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y) nounwind {
+  %mask = icmp sgt <2 x i64> %x, %y
+  %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
+  ret <2 x i64> %max
+}
+
+; CHECK-LABEL: @test128_3
+; CHECK: vpcmpled {{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind {
+  %mask = icmp sge <4 x i32> %x, %y
+  %max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: test128_4
+; CHECK: vpcmpnleuq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y) nounwind {
+  %mask = icmp ugt <2 x i64> %x, %y
+  %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
+  ret <2 x i64> %max
+}
+
+; CHECK-LABEL: test128_5
+; CHECK: vpcmpeqd  (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind {
+  %y = load <4 x i32>* %yp, align 4
+  %mask = icmp eq <4 x i32> %x, %y
+  %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: @test128_6
+; CHECK: vpcmpgtd (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
+  %y = load <4 x i32>* %y.ptr, align 4
+  %mask = icmp sgt <4 x i32> %x, %y
+  %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: @test128_7
+; CHECK: vpcmpled (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
+  %y = load <4 x i32>* %y.ptr, align 4
+  %mask = icmp sle <4 x i32> %x, %y
+  %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: @test128_8
+; CHECK: vpcmpleud (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
+  %y = load <4 x i32>* %y.ptr, align 4
+  %mask = icmp ule <4 x i32> %x, %y
+  %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: @test128_9
+; CHECK: vpcmpeqd %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32> %y1) nounwind {
+  %mask1 = icmp eq <4 x i32> %x1, %y1
+  %mask0 = icmp eq <4 x i32> %x, %y
+  %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+  %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: @test128_10
+; CHECK: vpcmpleq %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) nounwind {
+  %mask1 = icmp sge <2 x i64> %x1, %y1
+  %mask0 = icmp sle <2 x i64> %x, %y
+  %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer
+  %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1
+  ret <2 x i64> %max
+}
+
+; CHECK-LABEL: @test128_11
+; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind {
+  %mask1 = icmp sgt <2 x i64> %x1, %y1
+  %y = load <2 x i64>* %y.ptr, align 4
+  %mask0 = icmp sgt <2 x i64> %x, %y
+  %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer
+  %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1
+  ret <2 x i64> %max
+}
+
+; CHECK-LABEL: @test128_12
+; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind {
+  %mask1 = icmp sge <4 x i32> %x1, %y1
+  %y = load <4 x i32>* %y.ptr, align 4
+  %mask0 = icmp ule <4 x i32> %x, %y
+  %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+  %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: test128_13
+; CHECK: vpcmpeqq  (%rdi){1to2}, %xmm
+; CHECK: vmovdqa64
+; CHECK: ret
+define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind {
+  %yb = load i64* %yb.ptr, align 4
+  %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
+  %y = insertelement <2 x i64> %y.0, i64 %yb, i32 1
+  %mask = icmp eq <2 x i64> %x, %y
+  %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1
+  ret <2 x i64> %max
+}
+
+; CHECK-LABEL: test128_14
+; CHECK: vpcmpled  (%rdi){1to4}, %xmm
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind {
+  %yb = load i32* %yb.ptr, align 4
+  %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
+  %y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer
+  %mask = icmp sle <4 x i32> %x, %y
+  %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: test128_15
+; CHECK: vpcmpgtd  (%rdi){1to4}, %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind {
+  %mask1 = icmp sge <4 x i32> %x1, %y1
+  %yb = load i32* %yb.ptr, align 4
+  %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
+  %y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer
+  %mask0 = icmp sgt <4 x i32> %x, %y
+  %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+  %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: test128_16
+; CHECK: vpcmpgtq  (%rdi){1to2}, %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind {
+  %mask1 = icmp sge <2 x i64> %x1, %y1
+  %yb = load i64* %yb.ptr, align 4
+  %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
+  %y = insertelement <2 x i64> %y.0, i64 %yb, i32 1
+  %mask0 = icmp sgt <2 x i64> %x, %y
+  %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer
+  %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1
+  ret <2 x i64> %max
+}
diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll
deleted file mode 100644
index 34aaf2c..0000000
--- a/test/CodeGen/X86/blend-msb.ll
+++ /dev/null
@@ -1,40 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
-
-
-; Verify that we produce movss instead of blendvps when possible.
-
-;CHECK-LABEL: vsel_float:
-;CHECK-NOT: blend
-;CHECK: movss
-;CHECK: ret
-define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
-  ret <4 x float> %vsel
-}
-
-;CHECK-LABEL: vsel_4xi8:
-;CHECK-NOT: blend
-;CHECK: movss
-;CHECK: ret
-define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2
-  ret <4 x i8> %vsel
-}
-
-;CHECK-LABEL: vsel_8xi16:
-; The select mask is
-; <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>
-; which translates into the boolean mask (big endian representation):
-; 00010001 = 17.
-; '1' means takes the first argument, '0' means takes the second argument.
-; This is the opposite of the intel syntax, thus we expect
-; the inverted mask: 11101110 = 238.
-; According to the ABI:
-; v1 is in xmm0 => first argument is xmm0.
-; v2 is in xmm1 => second argument is xmm1.
-;CHECK: pblendw $238, %xmm1, %xmm0
-;CHECK: ret
-define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
-  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
-  ret <8 x i16> %vsel
-}
diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll
index 2681c10..cc40bcf 100644
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@@ -237,44 +237,6 @@ exit:
   ret i32 %base
 }
 
-define void @test_loop_rotate_reversed_blocks() {
-; This test case (greatly reduced from an Olden bencmark) ensures that the loop
-; rotate implementation doesn't assume that loops are laid out in a particular
-; order. The first loop will get split into two basic blocks, with the loop
-; header coming after the loop latch.
-;
-; CHECK: test_loop_rotate_reversed_blocks
-; CHECK: %entry
-; Look for a jump into the middle of the loop, and no branches mid-way.
-; CHECK: jmp
-; CHECK: %loop1
-; CHECK-NOT: j{{\w*}} .LBB{{.*}}
-; CHECK: %loop1
-; CHECK: je
-
-entry:
-  %cond1 = load volatile i1* undef
-  br i1 %cond1, label %loop2.preheader, label %loop1
-
-loop1:
-  call i32 @f()
-  %cond2 = load volatile i1* undef
-  br i1 %cond2, label %loop2.preheader, label %loop1
-
-loop2.preheader:
-  call i32 @f()
-  %cond3 = load volatile i1* undef
-  br i1 %cond3, label %exit, label %loop2
-
-loop2:
-  call i32 @f()
-  %cond4 = load volatile i1* undef
-  br i1 %cond4, label %exit, label %loop2
-
-exit:
-  ret void
-}
-
 define i32 @test_loop_align(i32 %i, i32* %a) {
 ; Check that we provide basic loop body alignment with the block placement
 ; pass.
diff --git a/test/CodeGen/X86/byval-callee-cleanup.ll b/test/CodeGen/X86/byval-callee-cleanup.ll
new file mode 100644
index 0000000..8e059d4
--- /dev/null
+++ b/test/CodeGen/X86/byval-callee-cleanup.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple=i686-win32 | FileCheck %s
+
+; Previously we would forget to align to stack slot alignment after placing a
+; byval argument.  Subsequent arguments would align themselves, but if it was
+; the last argument, the argument size would not be a multiple of stack slot
+; size. This resulted in retl $6 in callee-cleanup functions, as well as subtle
+; varargs bugs.
+
+%struct.Six = type { [6 x i8] }
+
+define x86_stdcallcc void @f(%struct.Six* byval %a) {
+  ret void
+}
+; CHECK-LABEL: _f@8:
+; CHECK: retl $8
+
+define x86_thiscallcc void @g(i8* %this, %struct.Six* byval %a) {
+  ret void
+}
+; CHECK-LABEL: _g:
+; CHECK: retl $8
+
+define x86_fastcallcc void @h(i32 inreg %x, i32 inreg %y, %struct.Six* byval %a) {
+  ret void
+}
+; CHECK-LABEL: @h@16:
+; CHECK: retl $8
diff --git a/test/CodeGen/X86/cfi_enforcing.ll b/test/CodeGen/X86/cfi_enforcing.ll
new file mode 100644
index 0000000..bcad8c1
--- /dev/null
+++ b/test/CodeGen/X86/cfi_enforcing.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=i386-unknown-linux-gnu -fcfi -cfi-enforcing <%s | FileCheck --check-prefix=X86 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -fcfi -cfi-enforcing <%s | FileCheck --check-prefix=X86-64 %s
+
+define void @indirect_fun() unnamed_addr jumptable {
+  ret void
+}
+
+define i32 @m(void ()* %fun) {
+  call void ()* %fun()
+; CHECK: subl
+; X86-64: andq    $8,
+; X86-64: leaq    __llvm_jump_instr_table_0_1({{%[a-z0-9]+}}), [[REG:%[a-z0-9]+]]
+; X86-64-NOT: callq __llvm_cfi_pointer_warning
+; X86-64: callq   *[[REG]]
+; X86: andl    $8,
+; X86: leal    __llvm_jump_instr_table_0_1({{%[a-z0-9]+}}), [[REG:%[a-z0-9]+]]
+; X86-NOT: calll __llvm_cfi_pointer_warning
+; X86: calll   *[[REG]]
+  ret i32 0
+}
+
+define void ()* @get_fun() {
+  ret void ()* @indirect_fun
+}
+
+define i32 @main(i32 %argc, i8** %argv) {
+  %f = call void ()* ()* @get_fun()
+  %a = call i32 @m(void ()* %f)
+  ret i32 %a
+}
+
+; CHECK: .align 8
+; CHECK: __llvm_jump_instr_table_0_1:
+; CHECK: jmp indirect_fun@PLT
diff --git a/test/CodeGen/X86/cfi_invoke.ll b/test/CodeGen/X86/cfi_invoke.ll
new file mode 100644
index 0000000..dd0d42a
--- /dev/null
+++ b/test/CodeGen/X86/cfi_invoke.ll
@@ -0,0 +1,35 @@
+; RUN: llc <%s -fcfi -cfi-type=sub | FileCheck %s
+; ModuleID = 'test.cc'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i32 @__gxx_personality_v0(...)
+
+@_ZTIPKc = external constant i8*
+@_ZTIi = external constant i8*
+
+define void @f() unnamed_addr jumptable {
+  ret void
+}
+
+@a = global void ()* @f
+
+; Make sure invoke gets targeted as well as regular calls
+define void @_Z3foov(void ()* %f) uwtable ssp {
+; CHECK-LABEL: _Z3foov:
+ entry:
+   invoke void %f()
+           to label %try.cont unwind label %lpad
+; CHECK: callq __llvm_cfi_pointer_warning
+; CHECK: callq *%rbx
+
+ lpad:
+   %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+                                  catch i8* bitcast (i8** @_ZTIi to i8*)
+                                  filter [1 x i8*] [i8* bitcast (i8** @_ZTIPKc to i8*)]
+   ret void
+
+ try.cont:
+   ret void
+}
+
diff --git a/test/CodeGen/X86/cfi_non_default_function.ll b/test/CodeGen/X86/cfi_non_default_function.ll
new file mode 100644
index 0000000..29774a1
--- /dev/null
+++ b/test/CodeGen/X86/cfi_non_default_function.ll
@@ -0,0 +1,27 @@
+; RUN: llc -fcfi -cfi-func-name=cfi_new_failure <%s | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+define void @indirect_fun() unnamed_addr jumptable {
+  ret void
+}
+
+define i32 @m(void ()* %fun) {
+; CHECK-LABEL: @m
+  call void ()* %fun()
+; CHECK: callq cfi_new_failure
+  ret i32 0
+}
+
+define void ()* @get_fun() {
+  ret void ()* @indirect_fun
+}
+
+define i32 @main(i32 %argc, i8** %argv) {
+  %f = call void ()* ()* @get_fun()
+  %a = call i32 @m(void ()* %f)
+  ret i32 %a
+}
+
+; CHECK: .align 8
+; CHECK: __llvm_jump_instr_table_0_1:
+; CHECK: jmp indirect_fun@PLT
diff --git a/test/CodeGen/X86/cfi_simple_indirect_call.ll b/test/CodeGen/X86/cfi_simple_indirect_call.ll
new file mode 100644
index 0000000..0ee118d
--- /dev/null
+++ b/test/CodeGen/X86/cfi_simple_indirect_call.ll
@@ -0,0 +1,43 @@
+; RUN: llc -fcfi -cfi-type=sub <%s | FileCheck --check-prefix=SUB %s
+; RUN: llc -fcfi -cfi-type=add <%s | FileCheck --check-prefix=ADD %s
+; RUN: llc -fcfi -cfi-type=ror <%s | FileCheck --check-prefix=ROR %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @indirect_fun() unnamed_addr jumptable {
+  ret void
+}
+
+define i32 @m(void ()* %fun) {
+  call void ()* %fun()
+; SUB: subl    
+; SUB: andq    $8
+; SUB-LABEL: leaq    __llvm_jump_instr_table_0_1
+; SUB-LABEL: callq   __llvm_cfi_pointer_warning
+
+; ROR: subq
+; ROR: rolq    $61
+; ROR: testq
+; ROR-LABEL: callq   __llvm_cfi_pointer_warning
+
+; ADD: andq    $8
+; ADD-LABEL: leaq    __llvm_jump_instr_table_0_1
+; ADD: cmpq
+; ADD-LABEL: callq   __llvm_cfi_pointer_warning
+ret i32 0
+}
+
+define void ()* @get_fun() {
+  ret void ()* @indirect_fun
+}
+
+define i32 @main(i32 %argc, i8** %argv) {
+  %f = call void ()* ()* @get_fun()
+  %a = call i32 @m(void ()* %f)
+  ret i32 %a
+}
+; SUB: .text
+; SUB: .align 8
+; SUB-LABEL: .type __llvm_jump_instr_table_0_1,@function
+; SUB-LABEL:__llvm_jump_instr_table_0_1:
+; SUB-LABEL: jmp indirect_fun@PLT
diff --git a/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
new file mode 100644
index 0000000..3cb8b97
--- /dev/null
+++ b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
@@ -0,0 +1,86 @@
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s
+
+declare i32 @bar()
+
+define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
+; CHECK-LABEL: test_intervening_call:
+; CHECK: cmpxchg
+; CHECK: pushfq
+; CHECK: popq [[FLAGS:%.*]]
+
+; CHECK: callq bar
+
+; CHECK: pushq [[FLAGS]]
+; CHECK: popfq
+; CHECK: jne
+  %cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst
+  %p = extractvalue { i64, i1 } %cx, 1
+  call i32 @bar()
+  br i1 %p, label %t, label %f
+
+t:
+  ret i64 42
+
+f:
+  ret i64 0
+}
+
+; Interesting in producing a clobber without any function calls.
+define i32 @test_control_flow(i32* %p, i32 %i, i32 %j) {
+; CHECK-LABEL: test_control_flow:
+
+; CHECK: cmpxchg
+; CHECK-NEXT: jne
+entry:
+  %cmp = icmp sgt i32 %i, %j
+  br i1 %cmp, label %loop_start, label %cond.end
+
+loop_start:
+  br label %while.condthread-pre-split.i
+
+while.condthread-pre-split.i:
+  %.pr.i = load i32* %p, align 4
+  br label %while.cond.i
+
+while.cond.i:
+  %0 = phi i32 [ %.pr.i, %while.condthread-pre-split.i ], [ 0, %while.cond.i ]
+  %tobool.i = icmp eq i32 %0, 0
+  br i1 %tobool.i, label %while.cond.i, label %while.body.i
+
+while.body.i:
+  %.lcssa = phi i32 [ %0, %while.cond.i ]
+  %1 = cmpxchg i32* %p, i32 %.lcssa, i32 %.lcssa seq_cst seq_cst
+  %2 = extractvalue { i32, i1 } %1, 1
+  br i1 %2, label %cond.end.loopexit, label %while.condthread-pre-split.i
+
+cond.end.loopexit:
+  br label %cond.end
+
+cond.end:
+  %cond = phi i32 [ %i, %entry ], [ 0, %cond.end.loopexit ]
+  ret i32 %cond
+}
+
+; This one is an interesting case because CMOV doesn't have a chain
+; operand. Naive attempts to limit cmpxchg EFLAGS use are likely to fail here.
+define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: test_feed_cmov:
+
+; CHECK: cmpxchg
+; CHECK: pushfq
+; CHECK: popq [[FLAGS:%.*]]
+
+; CHECK: callq bar
+
+; CHECK: pushq [[FLAGS]]
+; CHECK: popfq
+
+  %res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+  %success = extractvalue { i32, i1 } %res, 1
+
+  %rhs = call i32 @bar()
+
+  %ret = select i1 %success, i32 %new, i32 %rhs
+  ret i32 %ret
+}
diff --git a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
index 78e1dd2..85bfff2 100644
--- a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
+++ b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -S -codegenprepare %s -o - | FileCheck %s
 ; RUN: opt -S -codegenprepare -addr-sink-using-gep=1 %s -o - | FileCheck -check-prefix=CHECK-GEP %s
 ; This file tests the different cases what are involved when codegen prepare
-; tries to get sign extension out of the way of addressing mode.
+; tries to get sign/zero extension out of the way of addressing mode.
 ; This tests require an actual target as addressing mode decisions depends
 ; on the target.
 
@@ -67,6 +67,43 @@ define i8 @oneArgPromotion(i32 %arg1, i8* %base) {
   ret i8 %res
 }
 
+; Check that we are able to merge a sign extension with a zero extension.
+; CHECK-LABEL: @oneArgPromotionZExt
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1ZEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionZExt(i8 %arg1, i8* %base) {
+  %zext = zext i8 %arg1 to i32
+  %add = add nsw i32 %zext, 1 
+  %sextadd = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+  %res = load i8* %arrayidx
+  ret i8 %res
+}
+
+; When promoting a constant zext, the IR builder returns a constant,
+; not an instruction. Make sure this is properly handled. This used
+; to crash.
+; Note: The constant zext is promoted, but does not help matching
+; more thing in the addressing mode. Therefore the modification is
+; rolled back.
+; Still, this test case exercises the desired code path.
+; CHECK-LABEL: @oneArgPromotionCstZExt
+; CHECK: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i16 undef to i32
+; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i32 [[ZEXT]] to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionCstZExt(i8* %base) {
+  %cst = zext i16 undef to i32
+  %add = add nsw i32 %cst, 1
+  %sextadd = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+  %res = load i8* %arrayidx
+  ret i8 %res
+}
+
 ; Check that we do not promote truncate when we cannot determine the
 ; bits that are dropped.
 ; CHECK-LABEL: @oneArgPromotionBlockTrunc1
@@ -321,3 +358,177 @@ end:
   %final = load i32* %addr
   ret i32 %final
 }
+
+%struct.dns_packet = type { i32, i32, %union.anon }
+%union.anon = type { i32 }
+
+@a = common global i32 0, align 4
+@b = common global i16 0, align 2
+
+; We used to crash on this function because we did not return the right
+; promoted instruction for %conv.i.
+; Make sure we generate the right code now.
+; CHECK-LABEL: @fn3
+; %conv.i is used twice and only one of its use is being promoted.
+; Use it at the starting point for the matching.
+; CHECK: %conv.i = zext i16 [[PLAIN_OPND:%[.a-zA-Z_0-9-]+]] to i32
+; CHECK-NEXT: [[PROMOTED_CONV:%[.a-zA-Z_0-9-]+]] = zext i16 [[PLAIN_OPND]] to i64
+; CHECK-NEXT: [[BASE:%[a-zA-Z_0-9-]+]] = ptrtoint %struct.dns_packet* %P to i64
+; CHECK-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add i64 [[BASE]], [[PROMOTED_CONV]]
+; CHECK-NEXT: [[ADDR:%[a-zA-Z_0-9-]+]] = add i64 [[ADD]], 7
+; CHECK-NEXT: [[CAST:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[ADDR]] to i8*
+; CHECK-NEXT: load i8* [[CAST]], align 1
+define signext i16 @fn3(%struct.dns_packet* nocapture readonly %P) {
+entry:
+  %tmp = getelementptr inbounds %struct.dns_packet* %P, i64 0, i32 2
+  %data.i.i = bitcast %union.anon* %tmp to [0 x i8]*
+  br label %while.body.i.i
+
+while.body.i.i:                                   ; preds = %while.body.i.i, %entry
+  %src.addr.0.i.i = phi i16 [ 0, %entry ], [ %inc.i.i, %while.body.i.i ]
+  %inc.i.i = add i16 %src.addr.0.i.i, 1
+  %idxprom.i.i = sext i16 %src.addr.0.i.i to i64
+  %arrayidx.i.i = getelementptr inbounds [0 x i8]* %data.i.i, i64 0, i64 %idxprom.i.i
+  %tmp1 = load i8* %arrayidx.i.i, align 1
+  %conv2.i.i = zext i8 %tmp1 to i32
+  %and.i.i = and i32 %conv2.i.i, 15
+  store i32 %and.i.i, i32* @a, align 4
+  %tobool.i.i = icmp eq i32 %and.i.i, 0
+  br i1 %tobool.i.i, label %while.body.i.i, label %fn1.exit.i
+
+fn1.exit.i:                                       ; preds = %while.body.i.i
+  %inc.i.i.lcssa = phi i16 [ %inc.i.i, %while.body.i.i ]
+  %conv.i = zext i16 %inc.i.i.lcssa to i32
+  %sub.i = add nsw i32 %conv.i, -1
+  %idxprom.i = sext i32 %sub.i to i64
+  %arrayidx.i = getelementptr inbounds [0 x i8]* %data.i.i, i64 0, i64 %idxprom.i
+  %tmp2 = load i8* %arrayidx.i, align 1
+  %conv2.i = sext i8 %tmp2 to i16
+  store i16 %conv2.i, i16* @b, align 2
+  %sub4.i = sub nsw i32 0, %conv.i
+  %conv5.i = zext i16 %conv2.i to i32
+  %cmp.i = icmp sgt i32 %conv5.i, %sub4.i
+  br i1 %cmp.i, label %if.then.i, label %fn2.exit
+
+if.then.i:                                        ; preds = %fn1.exit.i
+  %end.i = getelementptr inbounds %struct.dns_packet* %P, i64 0, i32 1
+  %tmp3 = load i32* %end.i, align 4
+  %sub7.i = add i32 %tmp3, 65535
+  %conv8.i = trunc i32 %sub7.i to i16
+  br label %fn2.exit
+
+fn2.exit:                                         ; preds = %if.then.i, %fn1.exit.i
+  %retval.0.i = phi i16 [ %conv8.i, %if.then.i ], [ undef, %fn1.exit.i ]
+  ret i16 %retval.0.i
+}
+
+; Check that we do not promote an extension if the non-wrapping flag does not
+; match the kind of the extension.
+; CHECK-LABEL: @noPromotionFlag
+; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 %arg1, %arg2
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = zext i32 [[ADD]] to i64
+; CHECK: inttoptr i64 [[PROMOTED]] to i8*
+; CHECK: ret
+define i8 @noPromotionFlag(i32 %arg1, i32 %arg2) {
+  %add = add nsw i32 %arg1, %arg2 
+  %zextadd = zext i32 %add to i64
+  %base = inttoptr i64 %zextadd to i8*
+  %res = load i8* %base
+  ret i8 %res
+}
+
+; Check that we correctly promote both operands of the promotable add with zext.
+; CHECK-LABEL: @twoArgsPromotionZExt
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg1 to i64
+; CHECK: [[ARG2ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg2 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], [[ARG2ZEXT]]
+; CHECK: inttoptr i64 [[PROMOTED]] to i8*
+; CHECK: ret
+define i8 @twoArgsPromotionZExt(i32 %arg1, i32 %arg2) {
+  %add = add nuw i32 %arg1, %arg2 
+  %zextadd = zext i32 %add to i64
+  %base = inttoptr i64 %zextadd to i8*
+  %res = load i8* %base
+  ret i8 %res
+}
+
+; Check that we correctly promote constant arguments.
+; CHECK-LABEL: @oneArgPromotionNegativeCstZExt
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 255
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionNegativeCstZExt(i8 %arg1, i8* %base) {
+  %add = add nuw i8 %arg1, -1 
+  %zextadd = zext i8 %add to i64
+  %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd
+  %res = load i8* %arrayidx
+  ret i8 %res
+}
+
+; Check that we are able to merge two zero extensions.
+; CHECK-LABEL: @oneArgPromotionZExtZExt
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionZExtZExt(i8 %arg1, i8* %base) {
+  %zext = zext i8 %arg1 to i32
+  %add = add nuw i32 %zext, 1 
+  %zextadd = zext i32 %add to i64
+  %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd
+  %res = load i8* %arrayidx
+  ret i8 %res
+}
+
+; Check that we do not promote truncate when the dropped bits
+; are of a different kind.
+; CHECK-LABEL: @oneArgPromotionBlockTruncZExt
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i32
+; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[ARG1SEXT]] to i8
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1TRUNC]] to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionBlockTruncZExt(i1 %arg1, i8* %base) {
+  %sextarg1 = sext i1 %arg1 to i32
+  %trunc = trunc i32 %sextarg1 to i8
+  %add = add nuw i8 %trunc, 1 
+  %zextadd = zext i8 %add to i64
+  %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd
+  %res = load i8* %arrayidx
+  ret i8 %res
+}
+
+; Check that we are able to promote truncate when we know all the bits
+; that are dropped.
+; CHECK-LABEL: @oneArgPromotionPassTruncZExt
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i1 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionPassTruncZExt(i1 %arg1, i8* %base) {
+  %sextarg1 = zext i1 %arg1 to i32
+  %trunc = trunc i32 %sextarg1 to i8
+  %add = add nuw i8 %trunc, 1 
+  %zextadd = zext i8 %add to i64
+  %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd
+  %res = load i8* %arrayidx
+  ret i8 %res
+}
+
+; Check that we do not promote sext with zext.
+; CHECK-LABEL: @oneArgPromotionBlockSExtZExt
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i8
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1SEXT]] to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionBlockSExtZExt(i1 %arg1, i8* %base) {
+  %sextarg1 = sext i1 %arg1 to i8
+  %add = add nuw i8 %sextarg1, 1 
+  %zextadd = zext i8 %add to i64
+  %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd
+  %res = load i8* %arrayidx
+  ret i8 %res
+}
diff --git a/test/CodeGen/X86/coff-comdat.ll b/test/CodeGen/X86/coff-comdat.ll
index bf27b2f..ac4546d 100644
--- a/test/CodeGen/X86/coff-comdat.ll
+++ b/test/CodeGen/X86/coff-comdat.ll
@@ -73,19 +73,19 @@ $vftable = comdat largest
 ; CHECK: .globl  @v8@0
 ; CHECK: .section        .text,"xr",discard,@f8@0
 ; CHECK: .globl  @f8@0
-; CHECK: .section        .bss,"bw",associative,_f1
+; CHECK: .section        .bss,"wb",associative,_f1
 ; CHECK: .globl  _v1
-; CHECK: .section        .bss,"bw",associative,_f2
+; CHECK: .section        .bss,"wb",associative,_f2
 ; CHECK: .globl  _v2
-; CHECK: .section        .bss,"bw",associative,_f3
+; CHECK: .section        .bss,"wb",associative,_f3
 ; CHECK: .globl  _v3
-; CHECK: .section        .bss,"bw",associative,_f4
+; CHECK: .section        .bss,"wb",associative,_f4
 ; CHECK: .globl  _v4
-; CHECK: .section        .bss,"bw",associative,_f5
+; CHECK: .section        .bss,"wb",associative,_f5
 ; CHECK: .globl  _v5
-; CHECK: .section        .bss,"bw",associative,_f6
+; CHECK: .section        .bss,"wb",associative,_f6
 ; CHECK: .globl  _v6
-; CHECK: .section        .bss,"bw",same_size,_f6
+; CHECK: .section        .bss,"wb",same_size,_f6
 ; CHECK: .globl  _f6
 ; CHECK: .section        .rdata,"rd",largest,_vftable
 ; CHECK: .globl  _vftable
diff --git a/test/CodeGen/X86/coff-comdat2.ll b/test/CodeGen/X86/coff-comdat2.ll
index 6744b5b..58bc04e 100644
--- a/test/CodeGen/X86/coff-comdat2.ll
+++ b/test/CodeGen/X86/coff-comdat2.ll
@@ -6,4 +6,4 @@ target triple = "i686-pc-windows-msvc"
 $foo = comdat largest
 @foo = global i32 0
 @bar = global i32 0, comdat $foo
-; CHECK: Associative COMDAT symbol 'foo' is not a key for it's COMDAT.
+; CHECK: Associative COMDAT symbol 'foo' is not a key for its COMDAT.
diff --git a/test/CodeGen/X86/combine-and.ll b/test/CodeGen/X86/combine-and.ll
new file mode 100644
index 0000000..59a7a19
--- /dev/null
+++ b/test/CodeGen/X86/combine-and.ll
@@ -0,0 +1,164 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s
+;
+; Verify that the DAGCombiner is able to fold a vector AND into a blend
+; if one of the operands to the AND is a vector of all constants, and each
+; constant element is either zero or all-ones.
+
+
+define <4 x i32> @test1(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 0>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test1
+; CHECK: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test2(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 0>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test2
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test3(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 0, i32 0, i32 -1, i32 0>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test3
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test4(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 0, i32 0, i32 0, i32 -1>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test4
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test5(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 0>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test5
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test6(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 -1>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test6
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test7(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 0, i32 0, i32 -1, i32 -1>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test7
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test8(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 -1>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test8
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test9(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 0, i32 0>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test9
+; CHECK: movq %xmm0, %xmm0
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test10(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 0>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test10
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test11(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test11
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test12(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 -1, i32 0>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test12
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test13(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 0, i32 -1>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test13
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test14(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 -1>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test14
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test15(<4 x i32> %A, <4 x i32> %B) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 -1>
+  %2 = and <4 x i32> %B, <i32 0, i32 -1, i32 0, i32 0>
+  %3 = or <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+; CHECK-LABEL: test15
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test16(<4 x i32> %A, <4 x i32> %B) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 0>
+  %2 = and <4 x i32> %B, <i32 0, i32 -1, i32 0, i32 -1>
+  %3 = or <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+; CHECK-LABEL: test16
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test17(<4 x i32> %A, <4 x i32> %B) {
+  %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 -1>
+  %2 = and <4 x i32> %B, <i32 -1, i32 0, i32 -1, i32 0>
+  %3 = or <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+; CHECK-LABEL: test17
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll
index ff807b9..9539eae 100644
--- a/test/CodeGen/X86/combine-or.ll
+++ b/test/CodeGen/X86/combine-or.ll
@@ -5,265 +5,293 @@
 ; instruction which performs a blend operation.
 
 define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
   %or = or <2 x i64> %shuf1, %shuf2
   ret <2 x i64> %or
 }
-; CHECK-LABEL: test1
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK-NOT: orps
-; CHECK: ret
 
 
 define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test2
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
 
 
 define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
   %or = or <2 x i64> %shuf1, %shuf2
   ret <2 x i64> %or
 }
-; CHECK-LABEL: test3
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK-NEXT: ret
 
 
 define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test4
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NOT: orps
-; CHECK: ret
 
 
 define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test5
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NEXT: ret
 
 
 define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test6
-; CHECK-NOT: xorps
-; CHECK: blendps $12
-; CHECK-NEXT: ret
 
 
 define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-NEXT:    retq
   %and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0>
   %and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1>
   %or = or <4 x i32> %and1, %and2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test7
-; CHECK-NOT: xorps
-; CHECK: blendps $12
-; CHECK-NEXT: ret
 
 
 define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-NEXT:    retq
   %and1 = and <2 x i64> %a, <i64 -1, i64 0>
   %and2 = and <2 x i64> %b, <i64 0, i64 -1>
   %or = or <2 x i64> %and1, %and2
   ret <2 x i64> %or
 }
-; CHECK-LABEL: test8
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK-NOT: orps
-; CHECK: ret
 
 
 define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test9:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT:    retq
   %and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
   %and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0>
   %or = or <4 x i32> %and1, %and2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test9
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
 
 
 define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test10:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT:    retq
   %and1 = and <2 x i64> %a, <i64 0, i64 -1>
   %and2 = and <2 x i64> %b, <i64 -1, i64 0>
   %or = or <2 x i64> %and1, %and2
   ret <2 x i64> %or
 }
-; CHECK-LABEL: test10
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK-NEXT: ret
 
 
 define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test11:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; CHECK-NEXT:    retq
   %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
   %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
   %or = or <4 x i32> %and1, %and2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test11
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NOT: orps
-; CHECK: ret
 
 
 define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test12:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; CHECK-NEXT:    retq
   %and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
   %and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
   %or = or <4 x i32> %and1, %and2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test12
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NEXT: ret
 
 
 ; Verify that the following test cases are folded into single shuffles.
 
 define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test13:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test13
-; CHECK-NOT: xorps
-; CHECK: shufps
-; CHECK-NEXT: ret
 
 
 define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test14:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
   %or = or <2 x i64> %shuf1, %shuf2
   ret <2 x i64> %or
 }
-; CHECK-LABEL: test14
-; CHECK-NOT: pslldq
-; CHECK-NOT: por
-; CHECK: punpcklqdq
-; CHECK-NEXT: ret
 
 
 define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test15:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[2,1]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test15
-; CHECK-NOT: xorps
-; CHECK: shufps
-; CHECK-NOT: shufps
-; CHECK-NOT: orps
-; CHECK: ret
 
 
 define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
   %or = or <2 x i64> %shuf1, %shuf2
   ret <2 x i64> %or
 }
-; CHECK-LABEL: test16
-; CHECK-NOT: pslldq
-; CHECK-NOT: por
-; CHECK: punpcklqdq
-; CHECK: ret
 
 
 ; Verify that the dag-combiner does not fold a OR of two shuffles into a single
 ; shuffle instruction when the shuffle indexes are not compatible.
 
 define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test17:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xorps %xmm2, %xmm2
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,0]
+; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,2]
+; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; CHECK-NEXT:    orps %xmm1, %xmm2
+; CHECK-NEXT:    movaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test17
-; CHECK: por
-; CHECK-NEXT: ret
 
 
 define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test18:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xorps %xmm2, %xmm2
+; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; CHECK-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test18
-; CHECK: orps
-; CHECK: ret
 
 
 define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test19:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xorps %xmm2, %xmm2
+; CHECK-NEXT:    xorps %xmm3, %xmm3
+; CHECK-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[0,3]
+; CHECK-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
+; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,2]
+; CHECK-NEXT:    orps %xmm3, %xmm2
+; CHECK-NEXT:    movaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test19
-; CHECK: por
-; CHECK-NEXT: ret
 
 
 define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test20:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    movq %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
   %or = or <2 x i64> %shuf1, %shuf2
   ret <2 x i64> %or
 }
-; CHECK-LABEL: test20
-; CHECK-NOT: xorps
-; CHECK: orps
-; CHECK-NEXT: movq
-; CHECK-NEXT: ret
 
 
 define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test21:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    movq %xmm0, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
   %or = or <2 x i64> %shuf1, %shuf2
   ret <2 x i64> %or
 }
-; CHECK-LABEL: test21
-; CHECK: por
-; CHECK-NEXT: pslldq
-; CHECK-NEXT: ret
 
+; Verify that the DAGCombiner doesn't crash in the attempt to check if a shuffle
+; with illegal type has a legal mask. Method 'isShuffleMaskLegal' only knows how to
+; handle legal vector value types.
+define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
+; CHECK-LABEL: test_crash:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT:    retq
+  %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+  %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+  %or = or <4 x i8> %shuf1, %shuf2
+  ret <4 x i8> %or
+}
 
diff --git a/test/CodeGen/X86/combine-vec-shuffle-2.ll b/test/CodeGen/X86/combine-vec-shuffle-2.ll
deleted file mode 100644
index 7ab7f80..0000000
--- a/test/CodeGen/X86/combine-vec-shuffle-2.ll
+++ /dev/null
@@ -1,164 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
-
-; Check that DAGCombiner correctly folds the following pairs of shuffles
-; using the following rules:
-;  1. shuffle(shuffle(x, y), undef) -> x
-;  2. shuffle(shuffle(x, y), undef) -> y
-;  3. shuffle(shuffle(x, y), undef) -> shuffle(x, undef)
-;  4. shuffle(shuffle(x, y), undef) -> shuffle(undef, y)
-;
-; Rules 3. and 4. are used only if the resulting shuffle mask is legal.
-
-define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test1
-; Mask: [3,0,0,1]
-; CHECK: pshufd $67
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test2
-; Mask: [2,0,0,3]
-; CHECK: pshufd $-62
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test3
-; Mask: [2,0,0,3]
-; CHECK: pshufd $-62
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test4
-; Mask: [0,0,0,1]
-; CHECK: pshufd $64
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test5
-; Mask: [1,1]
-; CHECK: movhlps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test6(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test6
-; Mask: [2,0,0,0]
-; CHECK: pshufd $2
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test7(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test7
-; Mask: [0,2,0,2]
-; CHECK: pshufd $-120
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test8
-; Mask: [1,0,3,0]
-; CHECK: pshufd $49
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test9(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test9
-; Mask: [1,3,0,2]
-; CHECK: pshufd $-115
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test10(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test10
-; Mask: [1,0,1,0]
-; CHECK: pshufd $17
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test11(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test11
-; Mask: [1,0,2,1]
-; CHECK: pshufd $97
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test12(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test12
-; Mask: [0,0,0,0]
-; CHECK: pshufd $0
-; CHECK-NEXT: ret
-
-
-; The following pair of shuffles is folded into vector %A.
-define <4 x i32> @test13(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test13
-; CHECK-NOT: pshufd
-; CHECK: ret
-
-
-; The following pair of shuffles is folded into vector %B.
-define <4 x i32> @test14(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test14
-; CHECK-NOT: pshufd
-; CHECK: ret
-
diff --git a/test/CodeGen/X86/combine-vec-shuffle.ll b/test/CodeGen/X86/combine-vec-shuffle.ll
deleted file mode 100644
index 9e6ab89..0000000
--- a/test/CodeGen/X86/combine-vec-shuffle.ll
+++ /dev/null
@@ -1,253 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
-
-; Verify that the DAGCombiner correctly folds according to the following rules:
-
-; fold (AND (shuf (A, C), shuf (B, C)) -> shuf (AND (A, B), C)
-; fold (OR  (shuf (A, C), shuf (B, C)) -> shuf (OR  (A, B), C)
-; fold (XOR (shuf (A, C), shuf (B, C)) -> shuf (XOR (A, B), V_0)
-
-; fold (AND (shuf (C, A), shuf (C, B)) -> shuf (C, AND (A, B))
-; fold (OR  (shuf (C, A), shuf (C, B)) -> shuf (C, OR  (A, B))
-; fold (XOR (shuf (C, A), shuf (C, B)) -> shuf (V_0, XOR (A, B))
-
-
-
-define <4 x i32> @test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
-  %and = and <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %and
-}
-; CHECK-LABEL: test1
-; CHECK-NOT: pshufd
-; CHECK: pand
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-; CHECK-LABEL: test2
-; CHECK-NOT: pshufd
-; CHECK: por
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
-  %xor = xor <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %xor
-}
-; CHECK-LABEL: test3
-; CHECK-NOT: pshufd
-; CHECK: pxor
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
-  %and = and <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %and
-}
-; CHECK-LABEL: test4
-; CHECK-NOT: pshufd
-; CHECK: pand
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-; CHECK-LABEL: test5
-; CHECK-NOT: pshufd
-; CHECK: por
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
-  %xor = xor <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %xor
-}
-; CHECK-LABEL: test6
-; CHECK-NOT: pshufd
-; CHECK: pxor
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
-; are not performing a swizzle operations.
-
-define <4 x i32> @test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %and = and <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %and
-}
-; CHECK-LABEL: test1b
-; CHECK-NOT: blendps
-; CHECK: andps
-; CHECK-NEXT: blendps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-; CHECK-LABEL: test2b
-; CHECK-NOT: blendps
-; CHECK: orps
-; CHECK-NEXT: blendps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %xor = xor <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %xor
-}
-; CHECK-LABEL: test3b
-; CHECK-NOT: blendps
-; CHECK: xorps
-; CHECK-NEXT: xorps
-; CHECK-NEXT: blendps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %and = and <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %and
-}
-; CHECK-LABEL: test4b
-; CHECK-NOT: blendps
-; CHECK: andps
-; CHECK-NEXT: blendps
-; CHECK: ret
-
-
-define <4 x i32> @test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-; CHECK-LABEL: test5b
-; CHECK-NOT: blendps
-; CHECK: orps
-; CHECK-NEXT: blendps
-; CHECK: ret
-
-
-define <4 x i32> @test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %xor = xor <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %xor
-}
-; CHECK-LABEL: test6b
-; CHECK-NOT: blendps
-; CHECK: xorps
-; CHECK-NEXT: xorps
-; CHECK-NEXT: blendps
-; CHECK: ret
-
-define <4 x i32> @test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %and = and <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %and
-}
-; CHECK-LABEL: test1c
-; CHECK-NOT: shufps
-; CHECK: andps
-; CHECK-NEXT: shufps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-; CHECK-LABEL: test2c
-; CHECK-NOT: shufps
-; CHECK: orps
-; CHECK-NEXT: shufps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %xor = xor <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %xor
-}
-; CHECK-LABEL: test3c
-; CHECK-NOT: shufps
-; CHECK: xorps
-; CHECK-NEXT: xorps
-; CHECK-NEXT: shufps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %and = and <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %and
-}
-; CHECK-LABEL: test4c
-; CHECK-NOT: shufps
-; CHECK: andps
-; CHECK-NEXT: shufps
-; CHECK: ret
-
-
-define <4 x i32> @test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-; CHECK-LABEL: test5c
-; CHECK-NOT: shufps
-; CHECK: orps
-; CHECK-NEXT: shufps
-; CHECK: ret
-
-
-define <4 x i32> @test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %xor = xor <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %xor
-}
-; CHECK-LABEL: test6c
-; CHECK-NOT: shufps
-; CHECK: xorps
-; CHECK-NEXT: xorps
-; CHECK-NEXT: shufps
-; CHECK: ret
-
diff --git a/test/CodeGen/X86/commute-blend-avx2.ll b/test/CodeGen/X86/commute-blend-avx2.ll
new file mode 100644
index 0000000..d06c6da
--- /dev/null
+++ b/test/CodeGen/X86/commute-blend-avx2.ll
@@ -0,0 +1,89 @@
+; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=avx2 < %s | FileCheck %s
+
+define <8 x i16> @commute_fold_vpblendw_128(<8 x i16> %a, <8 x i16>* %b) #0 {
+  %1 = load <8 x i16>* %b
+  %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17)
+  ret <8 x i16> %2
+
+  ;LABEL:      commute_fold_vpblendw_128
+  ;CHECK:      vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+  ;CHECK-NEXT: retq
+}
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <16 x i16> @commute_fold_vpblendw_256(<16 x i16> %a, <16 x i16>* %b) #0 {
+  %1 = load <16 x i16>* %b
+  %2 = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %1, <16 x i16> %a, i8 17)
+  ret <16 x i16> %2
+
+  ;LABEL:      commute_fold_vpblendw_256
+  ;CHECK:      vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15]
+  ;CHECK-NEXT: retq
+}
+declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone
+
+define <4 x i32> @commute_fold_vpblendd_128(<4 x i32> %a, <4 x i32>* %b) #0 {
+  %1 = load <4 x i32>* %b
+  %2 = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %1, <4 x i32> %a, i8 1)
+  ret <4 x i32> %2
+
+  ;LABEL:      commute_fold_vpblendd_128
+  ;CHECK:      vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+  ;CHECK-NEXT: retq
+}
+declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <8 x i32> @commute_fold_vpblendd_256(<8 x i32> %a, <8 x i32>* %b) #0 {
+  %1 = load <8 x i32>* %b
+  %2 = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %1, <8 x i32> %a, i8 129)
+  ret <8 x i32> %2
+
+  ;LABEL:      commute_fold_vpblendd_256
+  ;CHECK:      vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7]
+  ;CHECK-NEXT: retq
+}
+declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
+
+define <4 x float> @commute_fold_vblendps_128(<4 x float> %a, <4 x float>* %b) #0 {
+  %1 = load <4 x float>* %b
+  %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3)
+  ret <4 x float> %2
+
+  ;LABEL:      commute_fold_vblendps_128
+  ;CHECK:      vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+  ;CHECK-NEXT: retq
+}
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x float> @commute_fold_vblendps_256(<8 x float> %a, <8 x float>* %b) #0 {
+  %1 = load <8 x float>* %b
+  %2 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %1, <8 x float> %a, i8 7)
+  ret <8 x float> %2
+
+  ;LABEL:      commute_fold_vblendps_256
+  ;CHECK:      vblendps {{.*#+}} ymm0 = ymm0[0,1,2],mem[3,4,5,6,7]
+  ;CHECK-NEXT: retq
+}
+declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, <2 x double>* %b) #0 {
+  %1 = load <2 x double>* %b
+  %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
+  ret <2 x double> %2
+
+  ;LABEL:      commute_fold_vblendpd_128
+  ;CHECK:      vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+  ;CHECK-NEXT: retq
+}
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, <4 x double>* %b) #0 {
+  %1 = load <4 x double>* %b
+  %2 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %1, <4 x double> %a, i8 7)
+  ret <4 x double> %2
+
+  ;LABEL:      commute_fold_vblendpd_256
+  ;CHECK:      vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],mem[3]
+  ;CHECK-NEXT: retq
+}
+declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/commute-blend-sse41.ll b/test/CodeGen/X86/commute-blend-sse41.ll
new file mode 100644
index 0000000..59fef8c
--- /dev/null
+++ b/test/CodeGen/X86/commute-blend-sse41.ll
@@ -0,0 +1,34 @@
+; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=corei7 < %s | FileCheck %s
+
+define <8 x i16> @commute_fold_pblendw(<8 x i16> %a, <8 x i16>* %b) #0 {
+  %1 = load <8 x i16>* %b
+  %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17)
+  ret <8 x i16> %2
+
+  ;LABEL:      commute_fold_pblendw
+  ;CHECK:      pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+  ;CHECK-NEXT: retq
+}
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <4 x float> @commute_fold_blendps(<4 x float> %a, <4 x float>* %b) #0 {
+  %1 = load <4 x float>* %b
+  %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3)
+  ret <4 x float> %2
+
+  ;LABEL:      commute_fold_blendps
+  ;CHECK:      blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+  ;CHECK-NEXT: retq
+}
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <2 x double> @commute_fold_blendpd(<2 x double> %a, <2 x double>* %b) #0 {
+  %1 = load <2 x double>* %b
+  %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
+  ret <2 x double> %2
+
+  ;LABEL:      commute_fold_vblendpd
+  ;CHECK:      blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+  ;CHECK-NEXT: retq
+}
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/commuted-blend-mask.ll b/test/CodeGen/X86/commuted-blend-mask.ll
new file mode 100644
index 0000000..e6322cb
--- /dev/null
+++ b/test/CodeGen/X86/commuted-blend-mask.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s
+
+; When commuting the operands of a SSE blend, make sure that the resulting blend
+; mask can be encoded as a imm8.
+; Before, when commuting the operands to the shuffle in function @test, the backend
+; produced the following assembly:
+;   pblendw $4294967103, %xmm1, %xmm0
+
+define <4 x i32> @test(<4 x i32> %a, <4 x i32> %b) {
+  ;CHECK: pblendw $63, %xmm1, %xmm0
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+  ret <4 x i32> %shuffle
+}
diff --git a/test/CodeGen/X86/constant-pool-remat-0.ll b/test/CodeGen/X86/constant-pool-remat-0.ll
index 4a01108..e42a87c 100644
--- a/test/CodeGen/X86/constant-pool-remat-0.ll
+++ b/test/CodeGen/X86/constant-pool-remat-0.ll
@@ -1,7 +1,7 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -mtriple=x86_64-linux   | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-linux -regalloc=greedy | FileCheck %s
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-linux -mattr=+sse2 | FileCheck %s
 ; CHECK:     LCPI
 ; CHECK:     LCPI
 ; CHECK:     LCPI
diff --git a/test/CodeGen/X86/constant-pool-sharing.ll b/test/CodeGen/X86/constant-pool-sharing.ll
index 26318dd..3682165 100644
--- a/test/CodeGen/X86/constant-pool-sharing.ll
+++ b/test/CodeGen/X86/constant-pool-sharing.ll
@@ -1,12 +1,13 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s --check-prefix=COMMON --check-prefix=LINUX
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s --check-prefix=COMMON --check-prefix=MSVC
 
 ; llc should share constant pool entries between this integer vector
 ; and this floating-point vector since they have the same encoding.
 
-; CHECK:  LCPI0_0(%rip), %xmm0
-; CHECK:  movaps        %xmm0, ({{%rdi|%rcx}})
-; CHECK:  movaps        %xmm0, ({{%rsi|%rdx}})
+; LINUX:   LCPI0_0(%rip), %xmm0
+; MSVC:    __xmm@40000000400000004000000040000000(%rip), %xmm0
+; COMMON:  movaps        %xmm0, ({{%rdi|%rcx}})
+; COMMON:  movaps        %xmm0, ({{%rsi|%rdx}})
 
 define void @foo(<4 x i32>* %p, <4 x float>* %q, i1 %t) nounwind {
 entry:
diff --git a/test/CodeGen/X86/constructor.ll b/test/CodeGen/X86/constructor.ll
index b578896..7160dcc 100644
--- a/test/CodeGen/X86/constructor.ll
+++ b/test/CodeGen/X86/constructor.ll
@@ -1,6 +1,8 @@
-; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=CTOR %s
-; RUN: llc -mtriple x86_64-pc-linux -use-init-array < %s | FileCheck --check-prefix=INIT-ARRAY %s
-@llvm.global_ctors = appending global [2 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @f }, { i32, void ()* } { i32 15, void ()* @g }]
+; RUN: llc -mtriple x86_64-pc-linux -use-ctors < %s | FileCheck --check-prefix=CTOR %s
+; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=INIT-ARRAY %s
+@llvm.global_ctors = appending global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* null}, { i32, void ()*, i8* } { i32 15, void ()* @g, i8* @v }]
+
+@v = weak_odr global i8 0
 
 define void @f() {
 entry:
@@ -12,14 +14,14 @@ entry:
   ret void
 }
 
-; CTOR:		.section	.ctors.65520,"aw",@progbits
+; CTOR:		.section	.ctors.65520,"aGw",@progbits,v,comdat
 ; CTOR-NEXT:	.align	8
 ; CTOR-NEXT:	.quad	g
 ; CTOR-NEXT:	.section	.ctors,"aw",@progbits
 ; CTOR-NEXT:	.align	8
 ; CTOR-NEXT:	.quad	f
 
-; INIT-ARRAY:		.section	.init_array.15,"aw",@init_array
+; INIT-ARRAY:		.section	.init_array.15,"aGw",@init_array,v,comdat
 ; INIT-ARRAY-NEXT:	.align	8
 ; INIT-ARRAY-NEXT:	.quad	g
 ; INIT-ARRAY-NEXT:	.section	.init_array,"aw",@init_array
diff --git a/test/CodeGen/X86/cvt16.ll b/test/CodeGen/X86/cvt16.ll
index 951b5c3..4d920e2 100644
--- a/test/CodeGen/X86/cvt16.ll
+++ b/test/CodeGen/X86/cvt16.ll
@@ -21,7 +21,7 @@
 
 
 define void @test1(float %src, i16* %dest) {
-  %1 = tail call i16 @llvm.convert.to.fp16(float %src)
+  %1 = tail call i16 @llvm.convert.to.fp16.f32(float %src)
   store i16 %1, i16* %dest, align 2
   ret void
 }
@@ -34,7 +34,7 @@ define void @test1(float %src, i16* %dest) {
 
 define float @test2(i16* nocapture %src) {
   %1 = load i16* %src, align 2
-  %2 = tail call float @llvm.convert.from.fp16(i16 %1)
+  %2 = tail call float @llvm.convert.from.fp16.f32(i16 %1)
   ret float %2
 }
 ; CHECK-LABEL: test2:
@@ -45,8 +45,8 @@ define float @test2(i16* nocapture %src) {
 
 
 define float @test3(float %src) nounwind uwtable readnone {
-  %1 = tail call i16 @llvm.convert.to.fp16(float %src)
-  %2 = tail call float @llvm.convert.from.fp16(i16 %1)
+  %1 = tail call i16 @llvm.convert.to.fp16.f32(float %src)
+  %2 = tail call float @llvm.convert.from.fp16.f32(i16 %1)
   ret float %2
 }
 
@@ -59,6 +59,31 @@ define float @test3(float %src) nounwind uwtable readnone {
 ; F16C-NEXT: vcvtph2ps
 ; F16C: ret
 
-declare float @llvm.convert.from.fp16(i16) nounwind readnone
-declare i16 @llvm.convert.to.fp16(float) nounwind readnone
+define double @test4(i16* nocapture %src) {
+  %1 = load i16* %src, align 2
+  %2 = tail call double @llvm.convert.from.fp16.f64(i16 %1)
+  ret double %2
+}
+; CHECK-LABEL: test4:
+; LIBCALL: callq  __gnu_h2f_ieee
+; LIBCALL: cvtss2sd
+; SOFTFLOAT: callq  __gnu_h2f_ieee
+; SOFTFLOAT: callq __extendsfdf2
+; F16C: vcvtph2ps
+; F16C: vcvtss2sd
+; F16C: ret
+
+
+define i16 @test5(double %src) {
+  %val = tail call i16 @llvm.convert.to.fp16.f64(double %src)
+  ret i16 %val
+}
+; CHECK-LABEL: test5:
+; LIBCALL: jmp  __truncdfhf2
+; SOFTFLOAT: callq  __truncdfhf2
+; F16C: jmp __truncdfhf2
 
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone
diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
index 4912213..d0791dc 100644
--- a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
@@ -52,48 +52,48 @@ define void @_Z3barii(i32 %param1, i32 %param2) #0 {
 entry:
   %var1 = alloca %struct.AAA3, align 1
   %var2 = alloca %struct.AAA3, align 1
-  tail call void @llvm.dbg.value(metadata !{i32 %param1}, i64 0, metadata !30), !dbg !47
-  tail call void @llvm.dbg.value(metadata !{i32 %param2}, i64 0, metadata !31), !dbg !47
-  tail call void @llvm.dbg.value(metadata !48, i64 0, metadata !32), !dbg !49
+  tail call void @llvm.dbg.value(metadata !{i32 %param1}, i64 0, metadata !30, metadata !{metadata !"0x102"}), !dbg !47
+  tail call void @llvm.dbg.value(metadata !{i32 %param2}, i64 0, metadata !31, metadata !{metadata !"0x102"}), !dbg !47
+  tail call void @llvm.dbg.value(metadata !48, i64 0, metadata !32, metadata !{metadata !"0x102"}), !dbg !49
   %tobool = icmp eq i32 %param2, 0, !dbg !50
   br i1 %tobool, label %if.end, label %if.then, !dbg !50
 
 if.then:                                          ; preds = %entry
   %call = tail call i8* @_Z5i2stri(i32 %param2), !dbg !52
-  tail call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !32), !dbg !49
+  tail call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !32, metadata !{metadata !"0x102"}), !dbg !49
   br label %if.end, !dbg !54
 
 if.end:                                           ; preds = %entry, %if.then
-  tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33), !dbg !55
-  tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !56), !dbg !57
-  tail call void @llvm.dbg.value(metadata !58, i64 0, metadata !59), !dbg !60
+  tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33, metadata !{metadata !"0x102"}), !dbg !55
+  tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !56, metadata !{metadata !"0x102"}), !dbg !57
+  tail call void @llvm.dbg.value(metadata !58, i64 0, metadata !59, metadata !{metadata !"0x102"}), !dbg !60
   %arraydecay.i = getelementptr inbounds %struct.AAA3* %var1, i64 0, i32 0, i64 0, !dbg !61
   call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !61
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34), !dbg !63
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !64), !dbg !65
-  call void @llvm.dbg.value(metadata !58, i64 0, metadata !66), !dbg !67
+  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34, metadata !{metadata !"0x102"}), !dbg !63
+  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !64, metadata !{metadata !"0x102"}), !dbg !65
+  call void @llvm.dbg.value(metadata !58, i64 0, metadata !66, metadata !{metadata !"0x102"}), !dbg !67
   %arraydecay.i5 = getelementptr inbounds %struct.AAA3* %var2, i64 0, i32 0, i64 0, !dbg !68
   call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !68
   %tobool1 = icmp eq i32 %param1, 0, !dbg !69
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34), !dbg !63
+  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34, metadata !{metadata !"0x102"}), !dbg !63
   br i1 %tobool1, label %if.else, label %if.then2, !dbg !69
 
 if.then2:                                         ; preds = %if.end
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !71), !dbg !73
-  call void @llvm.dbg.value(metadata !74, i64 0, metadata !75), !dbg !76
+  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !71, metadata !{metadata !"0x102"}), !dbg !73
+  call void @llvm.dbg.value(metadata !74, i64 0, metadata !75, metadata !{metadata !"0x102"}), !dbg !76
   call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)), !dbg !76
   br label %if.end3, !dbg !72
 
 if.else:                                          ; preds = %if.end
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !77), !dbg !79
-  call void @llvm.dbg.value(metadata !80, i64 0, metadata !81), !dbg !82
+  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !77, metadata !{metadata !"0x102"}), !dbg !79
+  call void @llvm.dbg.value(metadata !80, i64 0, metadata !81, metadata !{metadata !"0x102"}), !dbg !82
   call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)), !dbg !82
   br label %if.end3
 
 if.end3:                                          ; preds = %if.else, %if.then2
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33), !dbg !55
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !83), !dbg !85
-  call void @llvm.dbg.value(metadata !58, i64 0, metadata !86), !dbg !87
+  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33, metadata !{metadata !"0x102"}), !dbg !55
+  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !83, metadata !{metadata !"0x102"}), !dbg !85
+  call void @llvm.dbg.value(metadata !58, i64 0, metadata !86, metadata !{metadata !"0x102"}), !dbg !87
   call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !87
   ret void, !dbg !88
 }
@@ -103,7 +103,7 @@ declare i8* @_Z5i2stri(i32) #1
 declare void @_Z3fooPcjPKc(i8*, i32, i8*) #1
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
 attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -113,92 +113,92 @@ attributes #2 = { nounwind readnone }
 !llvm.module.flags = !{!44, !45}
 !llvm.ident = !{!46}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !3, metadata !23, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !23, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"dbg-changes-codegen-branch-folding.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"AAA3", i32 4, i64 32, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_structure_type ] [AAA3] [line 4, size 32, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00AAA3\004\0032\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_structure_type ] [AAA3] [line 4, size 32, align 8, offset 0] [def] [from ]
 !5 = metadata !{metadata !6, metadata !11, metadata !17, metadata !18}
-!6 = metadata !{i32 786445, metadata !1, metadata !"_ZTS4AAA3", metadata !"text", i32 8, i64 32, i64 8, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ] [text] [line 8, size 32, align 8, offset 0] [from ]
-!7 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 32, i64 8, i32 0, i32 0, metadata !8, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char]
-!8 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!6 = metadata !{metadata !"0xd\00text\008\0032\008\000\000", metadata !1, metadata !"_ZTS4AAA3", metadata !7} ; [ DW_TAG_member ] [text] [line 8, size 32, align 8, offset 0] [from ]
+!7 = metadata !{metadata !"0x1\00\000\0032\008\000\000", null, null, metadata !8, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char]
+!8 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786465, i64 0, i64 4}        ; [ DW_TAG_subrange_type ] [0, 3]
-!11 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"AAA3", metadata !"AAA3", metadata !"", i32 5, metadata !12, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 5} ; [ DW_TAG_subprogram ] [line 5] [AAA3]
-!12 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
+!11 = metadata !{metadata !"0x2e\00AAA3\00AAA3\00\005\000\000\000\006\00256\001\005", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 5] [AAA3]
+!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !13 = metadata !{null, metadata !14, metadata !15}
-!14 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4AAA3]
-!15 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!16 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from char]
-!17 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"operator=", metadata !"operator=", metadata !"_ZN4AAA3aSEPKc", i32 6, metadata !12, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 6} ; [ DW_TAG_subprogram ] [line 6] [operator=]
-!18 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"operator const char *", metadata !"operator const char *", metadata !"_ZNK4AAA3cvPKcEv", i32 7, metadata !19, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 7} ; [ DW_TAG_subprogram ] [line 7] [operator const char *]
-!19 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4AAA3]
+!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!16 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from char]
+!17 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN4AAA3aSEPKc\006\000\000\000\006\00256\001\006", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 6] [operator=]
+!18 = metadata !{metadata !"0x2e\00operator const char *\00operator const char *\00_ZNK4AAA3cvPKcEv\007\000\000\000\006\00256\001\007", metadata !1, metadata !"_ZTS4AAA3", metadata !19, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [operator const char *]
+!19 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !20 = metadata !{metadata !15, metadata !21}
-!21 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ]
-!22 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTS4AAA3"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS4AAA3]
+!21 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ]
+!22 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS4AAA3]
 !23 = metadata !{metadata !24, metadata !35, metadata !40}
-!24 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"bar", metadata !"bar", metadata !"_Z3barii", i32 11, metadata !26, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32, i32)* @_Z3barii, null, null, metadata !29, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [bar]
-!25 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
-!26 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !27, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3barii\0011\000\001\000\006\00256\001\0011", metadata !1, metadata !25, metadata !26, null, void (i32, i32)* @_Z3barii, null, null, metadata !29} ; [ DW_TAG_subprogram ] [line 11] [def] [bar]
+!25 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!26 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !27, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !27 = metadata !{null, metadata !28, metadata !28}
-!28 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!28 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !29 = metadata !{metadata !30, metadata !31, metadata !32, metadata !33, metadata !34}
-!30 = metadata !{i32 786689, metadata !24, metadata !"param1", metadata !25, i32 16777227, metadata !28, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [param1] [line 11]
-!31 = metadata !{i32 786689, metadata !24, metadata !"param2", metadata !25, i32 33554443, metadata !28, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [param2] [line 11]
-!32 = metadata !{i32 786688, metadata !24, metadata !"temp", metadata !25, i32 12, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [temp] [line 12]
-!33 = metadata !{i32 786688, metadata !24, metadata !"var1", metadata !25, i32 17, metadata !"_ZTS4AAA3", i32 0, i32 0} ; [ DW_TAG_auto_variable ] [var1] [line 17]
-!34 = metadata !{i32 786688, metadata !24, metadata !"var2", metadata !25, i32 18, metadata !"_ZTS4AAA3", i32 0, i32 0} ; [ DW_TAG_auto_variable ] [var2] [line 18]
-!35 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"operator=", metadata !"operator=", metadata !"_ZN4AAA3aSEPKc", i32 6, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !17, metadata !36, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [operator=]
+!30 = metadata !{metadata !"0x101\00param1\0016777227\000", metadata !24, metadata !25, metadata !28} ; [ DW_TAG_arg_variable ] [param1] [line 11]
+!31 = metadata !{metadata !"0x101\00param2\0033554443\000", metadata !24, metadata !25, metadata !28} ; [ DW_TAG_arg_variable ] [param2] [line 11]
+!32 = metadata !{metadata !"0x100\00temp\0012\000", metadata !24, metadata !25, metadata !15} ; [ DW_TAG_auto_variable ] [temp] [line 12]
+!33 = metadata !{metadata !"0x100\00var1\0017\000", metadata !24, metadata !25, metadata !"_ZTS4AAA3"} ; [ DW_TAG_auto_variable ] [var1] [line 17]
+!34 = metadata !{metadata !"0x100\00var2\0018\000", metadata !24, metadata !25, metadata !"_ZTS4AAA3"} ; [ DW_TAG_auto_variable ] [var2] [line 18]
+!35 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN4AAA3aSEPKc\006\000\001\000\006\00256\001\006", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, metadata !17, metadata !36} ; [ DW_TAG_subprogram ] [line 6] [def] [operator=]
 !36 = metadata !{metadata !37, metadata !39}
-!37 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!38 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4AAA3]
-!39 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [value] [line 6]
-!40 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"AAA3", metadata !"AAA3", metadata !"_ZN4AAA3C2EPKc", i32 5, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !11, metadata !41, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [AAA3]
+!37 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!38 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4AAA3]
+!39 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!40 = metadata !{metadata !"0x2e\00AAA3\00AAA3\00_ZN4AAA3C2EPKc\005\000\001\000\006\00256\001\005", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, metadata !11, metadata !41} ; [ DW_TAG_subprogram ] [line 5] [def] [AAA3]
 !41 = metadata !{metadata !42, metadata !43}
-!42 = metadata !{i32 786689, metadata !40, metadata !"this", null, i32 16777216, metadata !38, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!43 = metadata !{i32 786689, metadata !40, metadata !"value", metadata !25, i32 33554437, metadata !15, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [value] [line 5]
+!42 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !40, null, metadata !38} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!43 = metadata !{metadata !"0x101\00value\0033554437\000", metadata !40, metadata !25, metadata !15} ; [ DW_TAG_arg_variable ] [value] [line 5]
 !44 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!45 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!45 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !46 = metadata !{metadata !"clang version 3.5.0 "}
 !47 = metadata !{i32 11, i32 0, metadata !24, null}
 !48 = metadata !{i8* null}
 !49 = metadata !{i32 12, i32 0, metadata !24, null}
 !50 = metadata !{i32 14, i32 0, metadata !51, null}
-!51 = metadata !{i32 786443, metadata !1, metadata !24, i32 14, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!51 = metadata !{metadata !"0xb\0014\000\000", metadata !1, metadata !24} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
 !52 = metadata !{i32 15, i32 0, metadata !53, null}
-!53 = metadata !{i32 786443, metadata !1, metadata !51, i32 14, i32 0, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!53 = metadata !{metadata !"0xb\0014\000\000", metadata !1, metadata !51} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
 !54 = metadata !{i32 16, i32 0, metadata !53, null}
 !55 = metadata !{i32 17, i32 0, metadata !24, null}
-!56 = metadata !{i32 786689, metadata !40, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !55} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!56 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !40, null, metadata !38, metadata !55} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !57 = metadata !{i32 0, i32 0, metadata !40, metadata !55}
 !58 = metadata !{i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)}
-!59 = metadata !{i32 786689, metadata !40, metadata !"value", metadata !25, i32 33554437, metadata !15, i32 0, metadata !55} ; [ DW_TAG_arg_variable ] [value] [line 5]
+!59 = metadata !{metadata !"0x101\00value\0033554437\000", metadata !40, metadata !25, metadata !15, metadata !55} ; [ DW_TAG_arg_variable ] [value] [line 5]
 !60 = metadata !{i32 5, i32 0, metadata !40, metadata !55}
 !61 = metadata !{i32 5, i32 0, metadata !62, metadata !55}
-!62 = metadata !{i32 786443, metadata !1, metadata !40, i32 5, i32 0, i32 0, i32 3} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!62 = metadata !{metadata !"0xb\005\000\000", metadata !1, metadata !40} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
 !63 = metadata !{i32 18, i32 0, metadata !24, null}
-!64 = metadata !{i32 786689, metadata !40, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !63} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!64 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !40, null, metadata !38, metadata !63} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !65 = metadata !{i32 0, i32 0, metadata !40, metadata !63}
-!66 = metadata !{i32 786689, metadata !40, metadata !"value", metadata !25, i32 33554437, metadata !15, i32 0, metadata !63} ; [ DW_TAG_arg_variable ] [value] [line 5]
+!66 = metadata !{metadata !"0x101\00value\0033554437\000", metadata !40, metadata !25, metadata !15, metadata !63} ; [ DW_TAG_arg_variable ] [value] [line 5]
 !67 = metadata !{i32 5, i32 0, metadata !40, metadata !63}
 !68 = metadata !{i32 5, i32 0, metadata !62, metadata !63}
 !69 = metadata !{i32 20, i32 0, metadata !70, null}
-!70 = metadata !{i32 786443, metadata !1, metadata !24, i32 20, i32 0, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
-!71 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !72} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!70 = metadata !{metadata !"0xb\0020\000\000", metadata !1, metadata !24} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!71 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38, metadata !72} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !72 = metadata !{i32 21, i32 0, metadata !70, null}
 !73 = metadata !{i32 0, i32 0, metadata !35, metadata !72}
 !74 = metadata !{i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)}
-!75 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, metadata !72} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!75 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15, metadata !72} ; [ DW_TAG_arg_variable ] [value] [line 6]
 !76 = metadata !{i32 6, i32 0, metadata !35, metadata !72}
-!77 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !78} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!77 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38, metadata !78} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !78 = metadata !{i32 23, i32 0, metadata !70, null}
 !79 = metadata !{i32 0, i32 0, metadata !35, metadata !78}
 !80 = metadata !{i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)}
-!81 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, metadata !78} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!81 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15, metadata !78} ; [ DW_TAG_arg_variable ] [value] [line 6]
 !82 = metadata !{i32 6, i32 0, metadata !35, metadata !78}
-!83 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !84} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!83 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38, metadata !84} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !84 = metadata !{i32 24, i32 0, metadata !24, null}
 !85 = metadata !{i32 0, i32 0, metadata !35, metadata !84}
-!86 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, metadata !84} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!86 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15, metadata !84} ; [ DW_TAG_arg_variable ] [value] [line 6]
 !87 = metadata !{i32 6, i32 0, metadata !35, metadata !84}
 !88 = metadata !{i32 25, i32 0, metadata !24, null}
diff --git a/test/CodeGen/X86/dbg-changes-codegen.ll b/test/CodeGen/X86/dbg-changes-codegen.ll
index 0b17c45..aae95e8 100644
--- a/test/CodeGen/X86/dbg-changes-codegen.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen.ll
@@ -44,7 +44,7 @@
 define zeroext i1 @_ZN3Foo3batEv(%struct.Foo* %this) #0 align 2 {
 entry:
   %0 = load %struct.Foo** @pfoo, align 8
-  tail call void @llvm.dbg.value(metadata !{%struct.Foo* %0}, i64 0, metadata !62)
+  tail call void @llvm.dbg.value(metadata !{%struct.Foo* %0}, i64 0, metadata !62, metadata !{metadata !"0x102"})
   %cmp.i = icmp eq %struct.Foo* %0, %this
   ret i1 %cmp.i
 }
@@ -53,7 +53,7 @@ entry:
 define void @_Z3bazv() #1 {
 entry:
   %0 = load %struct.Wibble** @wibble1, align 8
-  tail call void @llvm.dbg.value(metadata !64, i64 0, metadata !65)
+  tail call void @llvm.dbg.value(metadata !64, i64 0, metadata !65, metadata !{metadata !"0x102"})
   %1 = load %struct.Wibble** @wibble2, align 8
   %cmp.i = icmp ugt %struct.Wibble* %1, %0
   br i1 %cmp.i, label %if.then.i, label %_ZN7Flibble3barEP6Wibble.exit
@@ -69,15 +69,15 @@ _ZN7Flibble3barEP6Wibble.exit:                    ; preds = %entry, %if.then.i
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
 attributes #0 = { nounwind readonly uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 
 
-!17 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from Foo]
-!45 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Flibble]
-!62 = metadata !{i32 786689, null, metadata !"arg", null, i32 33554436, metadata !17, i32 0, null} ; [ DW_TAG_arg_variable ] [arg] [line 4]
+!17 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, null} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from Foo]
+!45 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Flibble]
+!62 = metadata !{metadata !"0x101\00arg\0033554436\000", null, null, metadata !17} ; [ DW_TAG_arg_variable ] [arg] [line 4]
 !64 = metadata !{%struct.Flibble* undef}
-!65 = metadata !{i32 786689, null, metadata !"this", null, i32 16777229, metadata !45, i32 1088, null} ; [ DW_TAG_arg_variable ] [this] [line 13]
+!65 = metadata !{metadata !"0x101\00this\0016777229\001088", null, null, metadata !45} ; [ DW_TAG_arg_variable ] [this] [line 13]
diff --git a/test/CodeGen/X86/divide-by-constant.ll b/test/CodeGen/X86/divide-by-constant.ll
index 21225e3..fd07a3f 100644
--- a/test/CodeGen/X86/divide-by-constant.ll
+++ b/test/CodeGen/X86/divide-by-constant.ll
@@ -31,6 +31,7 @@ entry:
 ; CHECK-LABEL: test3:
 ; CHECK: movzbl  8(%esp), %eax
 ; CHECK-NEXT: imull	$171, %eax
+; CHECK-NEXT: andl $65024, %eax
 ; CHECK-NEXT: shrl	$9, %eax
 ; CHECK-NEXT: ret
 }
@@ -56,9 +57,10 @@ entry:
   %div = sdiv i16 %x, 10
   ret i16 %div
 ; CHECK-LABEL: test6:
-; CHECK: imull $26215, %eax, %ecx
-; CHECK: sarl $18, %ecx
-; CHECK: shrl $15, %eax
+; CHECK: imull $26215, %eax
+; CHECK: movl %eax, %ecx
+; CHECK: shrl $31, %ecx
+; CHECK: sarl $18, %eax
 }
 
 define i32 @test7(i32 %x) nounwind {
diff --git a/test/CodeGen/X86/divrem8_ext.ll b/test/CodeGen/X86/divrem8_ext.ll
new file mode 100644
index 0000000..ec367c8
--- /dev/null
+++ b/test/CodeGen/X86/divrem8_ext.ll
@@ -0,0 +1,100 @@
+; RUN: llc -march=x86-64 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-64
+; RUN: llc -march=x86    < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-32
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+define zeroext i8 @test_udivrem_zext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_udivrem_zext_ah
+; CHECK:   divb
+; CHECK:   movzbl %ah, [[REG_REM:%[a-z0-9]+]]
+; CHECK:   movb   %al, ([[REG_ZPTR:%[a-z0-9]+]])
+; CHECK:   movl   [[REG_REM]], %eax
+; CHECK:   ret
+  %div = udiv i8 %x, %y
+  store i8 %div, i8* @z
+  %1 = urem i8 %x, %y
+  ret i8 %1
+}
+
+define zeroext i8 @test_urem_zext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_urem_zext_ah
+; CHECK:   divb
+; CHECK:   movzbl %ah, %eax
+; CHECK:   ret
+  %1 = urem i8 %x, %y
+  ret i8 %1
+}
+
+define i8 @test_urem_noext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_urem_noext_ah
+; CHECK:   divb   [[REG_X:%[a-z0-9]+]]
+; CHECK:   movzbl %ah, %eax
+; CHECK:   addb   [[REG_X]], %al
+; CHECK:   ret
+  %1 = urem i8 %x, %y
+  %2 = add i8 %1, %y
+  ret i8 %2
+}
+
+define i64 @test_urem_zext64_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_urem_zext64_ah
+; CHECK:    divb
+; CHECK:    movzbl %ah, %eax
+; CHECK-32: xorl %edx, %edx
+; CHECK:    ret
+  %1 = urem i8 %x, %y
+  %2 = zext i8 %1 to i64
+  ret i64 %2
+}
+
+define signext i8 @test_sdivrem_sext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_sdivrem_sext_ah
+; CHECK:   cbtw
+; CHECK:   idivb
+; CHECK:   movsbl %ah, [[REG_REM:%[a-z0-9]+]]
+; CHECK:   movb   %al, ([[REG_ZPTR]])
+; CHECK:   movl   [[REG_REM]], %eax
+; CHECK:   ret
+  %div = sdiv i8 %x, %y
+  store i8 %div, i8* @z
+  %1 = srem i8 %x, %y
+  ret i8 %1
+}
+
+define signext i8 @test_srem_sext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_srem_sext_ah
+; CHECK:   cbtw
+; CHECK:   idivb
+; CHECK:   movsbl %ah, %eax
+; CHECK:   ret
+  %1 = srem i8 %x, %y
+  ret i8 %1
+}
+
+define i8 @test_srem_noext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_srem_noext_ah
+; CHECK:   cbtw
+; CHECK:   idivb [[REG_X:%[a-z0-9]+]]
+; CHECK:   movsbl %ah, %eax
+; CHECK:   addb   [[REG_X]], %al
+; CHECK:   ret
+  %1 = srem i8 %x, %y
+  %2 = add i8 %1, %y
+  ret i8 %2
+}
+
+define i64 @test_srem_sext64_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_srem_sext64_ah
+; CHECK:    cbtw
+; CHECK:    idivb
+; CHECK:    movsbl %ah, %eax
+; CHECK-32: movl %eax, %edx
+; CHECK-32: sarl $31, %edx
+; CHECK-64: movsbq %al, %rax
+; CHECK:    ret
+  %1 = srem i8 %x, %y
+  %2 = sext i8 %1 to i64
+  ret i64 %2
+}
+
+@z = external global i8
diff --git a/test/CodeGen/X86/dllexport-x86_64.ll b/test/CodeGen/X86/dllexport-x86_64.ll
index 0d5afa1..c673f5d 100644
--- a/test/CodeGen/X86/dllexport-x86_64.ll
+++ b/test/CodeGen/X86/dllexport-x86_64.ll
@@ -70,7 +70,7 @@ define weak_odr dllexport void @weak1() {
 
 ; CHECK: .weak weak_alias
 ; CHECK: weak_alias = f1
-@weak_alias = dllexport alias weak_odr void()* @f1
+@weak_alias = weak_odr dllexport alias void()* @f1
 
 @blob = global [6 x i8] c"\B8*\00\00\00\C3", section ".text", align 16
 @blob_alias = dllexport alias bitcast ([6 x i8]* @blob to i32 ()*)
diff --git a/test/CodeGen/X86/dllexport.ll b/test/CodeGen/X86/dllexport.ll
index e2c3f13..5035aa1 100644
--- a/test/CodeGen/X86/dllexport.ll
+++ b/test/CodeGen/X86/dllexport.ll
@@ -89,7 +89,7 @@ define weak_odr dllexport void @weak1() {
 
 ; CHECK: .weak _weak_alias
 ; CHECK: _weak_alias = _f1
-@weak_alias = dllexport alias weak_odr void()* @f1
+@weak_alias = weak_odr dllexport alias void()* @f1
 
 
 ; CHECK: .section .drectve
diff --git a/test/CodeGen/X86/dllimport-x86_64.ll b/test/CodeGen/X86/dllimport-x86_64.ll
index 666409f..839bca4 100644
--- a/test/CodeGen/X86/dllimport-x86_64.ll
+++ b/test/CodeGen/X86/dllimport-x86_64.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -mtriple x86_64-pc-mingw32 -O0 < %s | FileCheck %s -check-prefix=FAST
 ; PR6275
 ;
-; RUN: opt -mtriple x86_64-pc-win32 -std-compile-opts -S < %s | FileCheck %s -check-prefix=OPT
+; RUN: opt -mtriple x86_64-pc-win32 -O3 -S < %s | FileCheck %s -check-prefix=OPT
 
 @Var1 = external dllimport global i32
 @Var2 = available_externally dllimport unnamed_addr constant i32 1
diff --git a/test/CodeGen/X86/dllimport.ll b/test/CodeGen/X86/dllimport.ll
index 695bfce..231ad65 100644
--- a/test/CodeGen/X86/dllimport.ll
+++ b/test/CodeGen/X86/dllimport.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -mtriple i386-pc-mingw32 -O0 < %s | FileCheck %s -check-prefix=FAST
 ; PR6275
 ;
-; RUN: opt -mtriple i386-pc-win32 -std-compile-opts -S < %s | FileCheck %s -check-prefix=OPT
+; RUN: opt -mtriple i386-pc-win32 -O3 -S < %s | FileCheck %s -check-prefix=OPT
 
 @Var1 = external dllimport global i32
 @Var2 = available_externally dllimport unnamed_addr constant i32 1
diff --git a/test/CodeGen/X86/dont-trunc-store-double-to-float.ll b/test/CodeGen/X86/dont-trunc-store-double-to-float.ll
new file mode 100644
index 0000000..24d9533
--- /dev/null
+++ b/test/CodeGen/X86/dont-trunc-store-double-to-float.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=x86 < %s | FileCheck %s
+
+; CHECK-LABEL: @bar
+; CHECK: movl $1074339512,
+; CHECK: movl $1374389535,
+; CHECK: movl $1078523331,
+define void @bar() unnamed_addr {
+entry-block:
+  %a = alloca double
+  %b = alloca float
+
+  store double 3.140000e+00, double* %a
+  %0 = load double* %a
+
+  %1 = fptrunc double %0 to float
+
+  store float %1, float* %b
+
+  ret void
+}
diff --git a/test/CodeGen/X86/dwarf-comp-dir.ll b/test/CodeGen/X86/dwarf-comp-dir.ll
index c8d7527..872f7fa 100644
--- a/test/CodeGen/X86/dwarf-comp-dir.ll
+++ b/test/CodeGen/X86/dwarf-comp-dir.ll
@@ -7,15 +7,15 @@ target triple = "x86_64-unknown-linux-gnu"
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!5}
 
-!0 = metadata !{i32 720913, metadata !4, i32 12, metadata !"clang version 3.1 (trunk 143523)", i1 true, metadata !"", i32 0, metadata !2, metadata !7, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 143523)\001\00\000\00\000", metadata !4, metadata !2, metadata !7, metadata !2, metadata !2, null} ; [ DW_TAG_compile_unit ]
 !2 = metadata !{}
-!3 = metadata !{i32 786473, metadata !4} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x29", metadata !4} ; [ DW_TAG_file_type ]
 !4 = metadata !{metadata !"empty.c", metadata !"/home/nlewycky"}
-!6 = metadata !{i32 786451, metadata !4, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!6 = metadata !{metadata !"0x13\00foo\001\008\008\000\000\000", metadata !4, null, null, metadata !2, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
 !7 = metadata !{metadata !6}
 
 ; The important part of the following check is that dir = #0.
 ;                        Dir  Mod Time   File Len   File Name
 ;                        ---- ---------- ---------- ---------------------------
 ; CHECK: file_names[  1]    0 0x00000000 0x00000000 empty.c
-!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/dynamic-alloca-lifetime.ll b/test/CodeGen/X86/dynamic-alloca-lifetime.ll
new file mode 100644
index 0000000..f019bed
--- /dev/null
+++ b/test/CodeGen/X86/dynamic-alloca-lifetime.ll
@@ -0,0 +1,44 @@
+; RUN: llc -no-stack-coloring=false < %s | FileCheck %s
+
+; This test crashed in PEI because the stack protector was dead.
+; This was due to it being colored, which was in turn due to incorrect
+; lifetimes being applied to the stack protector frame index.
+
+; CHECK: stack_chk_guard
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.10.0"
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: ssp
+define void @foo(i1 %cond1, i1 %cond2) #1 {
+entry:
+  %bitmapBuffer = alloca [8192 x i8], align 1
+  br i1 %cond1, label %end1, label %bb1
+
+bb1:
+  %bitmapBuffer229 = alloca [8192 x i8], align 1
+  br i1 %cond2, label %end1, label %if.else130
+
+end1:
+  ret void
+
+if.else130:                                       ; preds = %bb1
+  %tmp = getelementptr inbounds [8192 x i8]* %bitmapBuffer, i32 0, i32 0
+  call void @llvm.lifetime.start(i64 8192, i8* %tmp) #0
+  call void @llvm.lifetime.end(i64 8192, i8* %tmp) #0
+  %tmp25 = getelementptr inbounds [8192 x i8]* %bitmapBuffer229, i32 0, i32 0
+  call void @llvm.lifetime.start(i64 8192, i8* %tmp25) #0
+  call void @llvm.lifetime.end(i64 8192, i8* %tmp25) #0
+  br label %end1
+}
+
+declare void @bar()
+
+attributes #0 = { nounwind }
+attributes #1 = { ssp }
+\ No newline at end of file
diff --git a/test/CodeGen/X86/empty-functions.ll b/test/CodeGen/X86/empty-functions.ll
index ac5174d..4234968 100644
--- a/test/CodeGen/X86/empty-functions.ll
+++ b/test/CodeGen/X86/empty-functions.ll
@@ -1,10 +1,14 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck -check-prefix=CHECK-NO-FP %s
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=LINUX-NO-FP %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -disable-fp-elim | FileCheck -check-prefix=LINUX-FP %s
 
 define void @func() {
 entry:
   unreachable
 }
+
+; MachO cannot handle an empty function.
 ; CHECK-NO-FP:     _func:
 ; CHECK-NO-FP-NEXT: .cfi_startproc
 ; CHECK-NO-FP:     nop
@@ -21,5 +25,30 @@ entry:
 ; CHECK-FP-NEXT: movq %rsp, %rbp
 ; CHECK-FP-NEXT: :
 ; CHECK-FP-NEXT: .cfi_def_cfa_register %rbp
-; CHECK-FP-NEXT: nop
 ; CHECK-FP-NEXT: .cfi_endproc
+
+; An empty function is perfectly fine on ELF.
+; LINUX-NO-FP: func:
+; LINUX-NO-FP-NEXT: .cfi_startproc
+; LINUX-NO-FP-NEXT: {{^}}#
+; LINUX-NO-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-NO-FP-NEXT: .size   func, .L{{.*}}-func
+; LINUX-NO-FP-NEXT: .cfi_endproc
+
+; A cfi directive can point to the end of a function. It (and in fact the
+; entire body) could be optimized out because of the unreachable, but we
+; don't do it right now.
+; LINUX-FP: func:
+; LINUX-FP-NEXT: .cfi_startproc
+; LINUX-FP-NEXT: {{^}}#
+; LINUX-FP-NEXT: pushq %rbp
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT:  .cfi_def_cfa_offset 16
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_offset %rbp, -16
+; LINUX-FP-NEXT: movq        %rsp, %rbp
+; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_def_cfa_register %rbp
+; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .size   func, .Ltmp3-func
+; LINUX-FP-NEXT: .cfi_endproc
diff --git a/test/CodeGen/X86/exedepsfix-broadcast.ll b/test/CodeGen/X86/exedepsfix-broadcast.ll
index a18f751..ab92fe0 100644
--- a/test/CodeGen/X86/exedepsfix-broadcast.ll
+++ b/test/CodeGen/X86/exedepsfix-broadcast.ll
@@ -93,10 +93,11 @@ define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %
 
 
 ; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg
-; ExeDepsFix works top down, thus it coalesces vmovlhps domain with
-; vandps and there is nothing more you can do to match vmaxpd.
-; CHECK: vmovlhps
-; CHECK: vandps
+; ExeDepsFix works top down, thus it coalesces vpunpcklqdq domain with
+; vpand and there is nothing more you can do to match vmaxpd.
+; CHECK: vmovq
+; CHECK: vpbroadcastq
+; CHECK: vpand
 ; CHECK: vmaxpd
 ; CHECK: ret
 define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) {
diff --git a/test/CodeGen/X86/extractelement-load.ll b/test/CodeGen/X86/extractelement-load.ll
index cadc0fb..8647599 100644
--- a/test/CodeGen/X86/extractelement-load.ll
+++ b/test/CodeGen/X86/extractelement-load.ll
@@ -1,6 +1,8 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=yonah | FileCheck %s
 ; RUN: llc < %s -march=x86-64 -mattr=+sse2 -mcpu=core2 | FileCheck %s
 
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
 define i32 @t(<2 x i64>* %val) nounwind  {
 ; CHECK-LABEL: t:
 ; CHECK-NOT: movd
@@ -23,3 +25,40 @@ undef, i32 7, i32 9, i32 undef, i32 13, i32 15, i32 1, i32 3>
   %y = extractelement <8 x i32> %Shuff68, i32 0
   ret i32 %y
 }
+
+; This case could easily end up inf-looping in the DAG combiner due to an
+; low alignment load of the vector which prevents us from reliably forming a
+; narrow load.
+; FIXME: It would be nice to detect whether the target has fast and legal
+; unaligned loads and use them here.
+define void @t3() {
+; CHECK-LABEL: t3:
+;
+; This movs the entire vector, shuffling the high double down. If we fixed the
+; FIXME above it would just move the high double directly.
+; CHECK: movupd
+; CHECK: shufpd
+; CHECK: movlpd
+
+bb:
+  %tmp13 = load <2 x double>* undef, align 1
+  %.sroa.3.24.vec.extract = extractelement <2 x double> %tmp13, i32 1
+  store double %.sroa.3.24.vec.extract, double* undef, align 8
+  unreachable
+}
+
+; Case where a load is unary shuffled, then bitcast (to a type with the same
+; number of elements) before extractelement.
+; This is testing for an assertion - the extraction was assuming that the undef
+; second shuffle operand was a post-bitcast type instead of a pre-bitcast type.
+define i64 @t4(<2 x double>* %a) {
+; CHECK-LABEL: t4:
+; CHECK: mov
+; CHECK: ret
+  %b = load <2 x double>* %a, align 16
+  %c = shufflevector <2 x double> %b, <2 x double> %b, <2 x i32> <i32 1, i32 0>
+  %d = bitcast <2 x double> %c to <2 x i64>
+  %e = extractelement <2 x i64> %d, i32 1
+  ret i64 %e
+}
+
diff --git a/test/CodeGen/X86/fast-isel-args-fail.ll b/test/CodeGen/X86/fast-isel-args-fail.ll
index 7467edd..7e783d2 100644
--- a/test/CodeGen/X86/fast-isel-args-fail.ll
+++ b/test/CodeGen/X86/fast-isel-args-fail.ll
@@ -1,7 +1,6 @@
 ; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-apple-darwin10
 ; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN32
 ; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-pc-win64 | FileCheck %s -check-prefix=WIN64
-; REQUIRES: asserts
 
 ; Previously, this would cause an assert.
 define i31 @t1(i31 %a, i31 %b, i31 %c) {
diff --git a/test/CodeGen/X86/fast-isel-cmp-branch3.ll b/test/CodeGen/X86/fast-isel-cmp-branch3.ll
index a3f6851..0df782d 100644
--- a/test/CodeGen/X86/fast-isel-cmp-branch3.ll
+++ b/test/CodeGen/X86/fast-isel-cmp-branch3.ll
@@ -351,7 +351,7 @@ bb1:
 define i32 @icmp_eq(i32 %x) {
 ; CHECK-LABEL: icmp_eq
 ; CHECK-NOT:   cmpl
-; CHECK:       movl $0, %eax
+; CHECK:       xorl %eax, %eax
   %1 = icmp eq i32 %x, %x
   br i1 %1, label %bb1, label %bb2
 bb2:
@@ -387,7 +387,7 @@ bb1:
 define i32 @icmp_uge(i32 %x) {
 ; CHECK-LABEL: icmp_uge
 ; CHECK-NOT:   cmpl
-; CHECK:       movl $0, %eax
+; CHECK:       xorl %eax, %eax
   %1 = icmp uge i32 %x, %x
   br i1 %1, label %bb1, label %bb2
 bb2:
@@ -411,7 +411,7 @@ bb1:
 define i32 @icmp_ule(i32 %x) {
 ; CHECK-LABEL: icmp_ule
 ; CHECK-NOT:   cmpl
-; CHECK:       movl $0, %eax
+; CHECK:       xorl %eax, %eax
   %1 = icmp ule i32 %x, %x
   br i1 %1, label %bb1, label %bb2
 bb2:
@@ -435,7 +435,7 @@ bb1:
 define i32 @icmp_sge(i32 %x) {
 ; CHECK-LABEL: icmp_sge
 ; CHECK-NOT:   cmpl
-; CHECK:       movl $0, %eax
+; CHECK:       xorl %eax, %eax
   %1 = icmp sge i32 %x, %x
   br i1 %1, label %bb1, label %bb2
 bb2:
@@ -459,7 +459,7 @@ bb1:
 define i32 @icmp_sle(i32 %x) {
 ; CHECK-LABEL: icmp_sle
 ; CHECK-NOT:   cmpl
-; CHECK:       movl $0, %eax
+; CHECK:       xorl %eax, %eax
   %1 = icmp sle i32 %x, %x
   br i1 %1, label %bb1, label %bb2
 bb2:
diff --git a/test/CodeGen/X86/fast-isel-constpool.ll b/test/CodeGen/X86/fast-isel-constpool.ll
index bbbaeb2..4e6f7c0 100644
--- a/test/CodeGen/X86/fast-isel-constpool.ll
+++ b/test/CodeGen/X86/fast-isel-constpool.ll
@@ -1,19 +1,23 @@
-; RUN: llc < %s -fast-isel | FileCheck %s
-; CHECK: LCPI0_0(%rip)
+; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=small < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=large < %s | FileCheck %s --check-prefix=LARGE
 
-; Make sure fast isel uses rip-relative addressing when required.
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-target triple = "x86_64-apple-darwin9.0"
+; Make sure fast isel uses rip-relative addressing for the small code model.
+define float @constpool_float(float %x) {
+; CHECK-LABEL: constpool_float
+; CHECK:       LCPI0_0(%rip)
 
-define i32 @f0(double %x) nounwind {
-entry:
-	%retval = alloca i32		; <i32*> [#uses=2]
-	%x.addr = alloca double		; <double*> [#uses=2]
-	store double %x, double* %x.addr
-	%tmp = load double* %x.addr		; <double> [#uses=1]
-	%cmp = fcmp olt double %tmp, 8.500000e-01		; <i1> [#uses=1]
-	%conv = zext i1 %cmp to i32		; <i32> [#uses=1]
-	store i32 %conv, i32* %retval
-	%0 = load i32* %retval		; <i32> [#uses=1]
-	ret i32 %0
+; LARGE-LABEL: constpool_float
+; LARGE:       movabsq  $LCPI0_0, %rax
+  %1 = fadd float %x, 16.50e+01
+  ret float %1
+}
+
+define double @constpool_double(double %x) nounwind {
+; CHECK-LABEL: constpool_double
+; CHECK:       LCPI1_0(%rip)
+
+; LARGE-LABEL: constpool_double
+; LARGE:       movabsq  $LCPI1_0, %rax
+  %1 = fadd double %x, 8.500000e-01
+  ret double %1
 }
diff --git a/test/CodeGen/X86/fast-isel-mem.ll b/test/CodeGen/X86/fast-isel-mem.ll
index cd2dc1d..eca1ae9 100644
--- a/test/CodeGen/X86/fast-isel-mem.ll
+++ b/test/CodeGen/X86/fast-isel-mem.ll
@@ -36,11 +36,11 @@ entry:
 	store i32 (...)** getelementptr ([4 x i32 (...)*]* @LotsStuff, i32 0, i32 2), i32 (...)*** null, align 4
 	ret void
 ; CHECK: _t:
-; CHECK:	movl	$0, %eax
+; CHECK:	xorl    %eax, %eax
 ; CHECK:	movl	L_LotsStuff$non_lazy_ptr, %ecx
 
 ; ATOM: _t:
 ; ATOM:         movl    L_LotsStuff$non_lazy_ptr, %e{{..}}
-; ATOM:         movl    $0, %e{{..}}
+; ATOM:         xorl    %e{{..}}, %e{{..}}
 
 }
diff --git a/test/CodeGen/X86/fast-isel-tls.ll b/test/CodeGen/X86/fast-isel-tls.ll
index f71abd2..686df43 100644
--- a/test/CodeGen/X86/fast-isel-tls.ll
+++ b/test/CodeGen/X86/fast-isel-tls.ll
@@ -13,7 +13,7 @@ entry:
 ; CHECK: leal	v@TLSGD
 ; CHECK: __tls_get_addr
 
-@alias = alias internal i32* @v
+@alias = internal alias i32* @v
 define i32 @f_alias() nounwind {
 entry:
           %t = load i32* @v
diff --git a/test/CodeGen/X86/fast-isel-x32.ll b/test/CodeGen/X86/fast-isel-x32.ll
new file mode 100644
index 0000000..d49a108
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-x32.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-nacl -fast-isel -fast-isel-abort | FileCheck %s
+
+; Test that alloca addresses are materialized with the right size instruction.
+
+declare void @bar(i32* %arg)
+
+; CHECK-LABEL: @foo
+define void @foo() {
+  %a = alloca i32
+; CHECK: leal {{.*}}, %edi
+  call void @bar(i32* %a)
+  ret void
+}
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index f7d2750..3747d04 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -144,7 +144,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK-LABEL: test12:
 ; CHECK: testb	$1,
 ; CHECK-NEXT: je L
-; CHECK-NEXT: movl $0, %edi
+; CHECK-NEXT: xorl %edi, %edi
 ; CHECK-NEXT: callq
 }
 
@@ -154,7 +154,7 @@ define void @test13() nounwind {
   call void @test13f(i1 0)
   ret void
 ; CHECK-LABEL: test13:
-; CHECK: movl $0, %edi
+; CHECK: xorl %edi, %edi
 ; CHECK-NEXT: callq
 }
 
@@ -194,12 +194,10 @@ define void @test16() nounwind {
   br label %block2
 
 block2:
-; CHECK: movabsq $1
-; CHECK: cvtsi2sdq {{.*}} %xmm0
+; CHECK: movsd LCP{{.*}}_{{.*}}(%rip), %xmm0
 ; CHECK: movb $1, %al
 ; CHECK: callq _test16callee
 
-; AVX: movabsq $1
 ; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0
 ; AVX: movb $1, %al
 ; AVX: callq _test16callee
@@ -280,7 +278,7 @@ entry:
   call void @foo22(i32 3)
   ret void
 ; CHECK-LABEL: test22:
-; CHECK: movl	$0, %edi
+; CHECK: xorl	%edi, %edi
 ; CHECK: callq	_foo22
 ; CHECK: movl	$1, %edi
 ; CHECK: callq	_foo22
@@ -304,3 +302,13 @@ define void @test23(i8* noalias sret %result) {
 }
 
 declare i8* @foo23()
+
+declare void @takesi32ptr(i32* %arg)
+
+; CHECK-LABEL: allocamaterialize
+define void @allocamaterialize() {
+  %a = alloca i32
+; CHECK: leaq {{.*}}, %rdi
+  call void @takesi32ptr(i32* %a)
+  ret void
+}
diff --git a/test/CodeGen/X86/fast-isel-x86.ll b/test/CodeGen/X86/fast-isel-x86.ll
index a212a7c..61e9b98 100644
--- a/test/CodeGen/X86/fast-isel-x86.ll
+++ b/test/CodeGen/X86/fast-isel-x86.ll
@@ -60,3 +60,21 @@ entry:
 ; CHECK: addl $28
 }
 declare fastcc void @test4fastccsret(%struct.a* sret)
+
+
+; Check that fast-isel cleans up when it fails to lower a call instruction.
+define void @test5() {
+entry:
+  %call = call i32 @test5dllimport(i32 42)
+  ret void
+; CHECK-LABEL: test5:
+; Local value area is still there:
+; CHECK: movl $42, {{%[a-z]+}}
+; Fast-ISel's arg push is not here:
+; CHECK-NOT: movl $42, (%esp)
+; SDag-ISel's arg push:
+; CHECK: movl %esp, [[REGISTER:%[a-z]+]]
+; CHECK: movl $42, ([[REGISTER]])
+; CHECK: movl __imp__test5dllimport
+}
+declare dllimport i32 @test5dllimport(i32)
diff --git a/test/CodeGen/X86/fastmath-optnone.ll b/test/CodeGen/X86/fastmath-optnone.ll
new file mode 100644
index 0000000..0caadff
--- /dev/null
+++ b/test/CodeGen/X86/fastmath-optnone.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -mcpu=corei7 -march=x86-64 -mattr=+sse2 | FileCheck %s
+; Verify that floating-point operations inside 'optnone' functions
+; are not optimized even if unsafe-fp-math is set.
+
+define float @foo(float %x) #0 {
+entry:
+  %add = fadd fast float %x, %x
+  %add1 = fadd fast float %add, %x
+  ret float %add1
+}
+
+; CHECK-LABEL: @foo
+; CHECK-NOT: add
+; CHECK: mul
+; CHECK-NOT: add
+; CHECK: ret
+
+define float @fooWithOptnone(float %x) #1 {
+entry:
+  %add = fadd fast float %x, %x
+  %add1 = fadd fast float %add, %x
+  ret float %add1
+}
+
+; CHECK-LABEL: @fooWithOptnone
+; CHECK-NOT: mul
+; CHECK: add
+; CHECK-NOT: mul
+; CHECK: add
+; CHECK-NOT: mul
+; CHECK: ret
+
+
+attributes #0 = { "unsafe-fp-math"="true" }
+attributes #1 = { noinline optnone "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma-intrinsics-x86_64.ll
index 494cb28..aadd731 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma-intrinsics-x86_64.ll
@@ -1,316 +1,278 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK-FMA --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK-FMA4 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK-FMA4 --check-prefix=CHECK
 
 ; VFMADD
 define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfmaddss
-  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-define < 4 x float > @test_x86_fma_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) {
-  ; CHECK: vfmaddss (%{{.*}})
-  %x = load float *%a2
-  %y = insertelement <4 x float> undef, float %x, i32 0
-  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-define < 4 x float > @test_x86_fma_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) {
-  ; CHECK: vfmaddss %{{.*}}, (%{{.*}})
-  %x = load float *%a1
-  %y = insertelement <4 x float> undef, float %x, i32 0
-  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfmaddss
+  ; CHECK-FMA: vfmadd213ss
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
   ret < 4 x float > %res
 }
 declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
 define < 2 x double > @test_x86_fma_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfmaddsd
-  %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-define < 2 x double > @test_x86_fma_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) {
-  ; CHECK: vfmaddsd (%{{.*}})
-  %x = load double *%a2
-  %y = insertelement <2 x double> undef, double %x, i32 0
-  %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-define < 2 x double > @test_x86_fma_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) {
-  ; CHECK: vfmaddsd %{{.*}}, (%{{.*}})
-  %x = load double *%a1
-  %y = insertelement <2 x double> undef, double %x, i32 0
-  %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfmaddsd
+  ; CHECK-FMA: vfmadd213sd
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
   ret < 2 x double > %res
 }
 declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
 define < 4 x float > @test_x86_fma_vfmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfmaddps
-  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-define < 4 x float > @test_x86_fma_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) {
-  ; CHECK: vfmaddps (%{{.*}})
-  %x = load <4 x float>* %a2
-  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-define < 4 x float > @test_x86_fma_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) {
-  ; CHECK: vfmaddps %{{.*}}, (%{{.*}})
-  %x = load <4 x float>* %a1
-  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfmaddps
+  ; CHECK-FMA: vfmadd213ps
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
   ret < 4 x float > %res
 }
 declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
-; To test execution dependency
-define < 4 x float > @test_x86_fma_vfmadd_ps_load3(< 4 x float >* %a0, < 4 x float >* %a1, < 4 x float > %a2) {
-  ; CHECK: vmovaps
-  ; CHECK: vfmaddps %{{.*}}, (%{{.*}})
-  %x = load <4 x float>* %a0
-  %y = load <4 x float>* %a1
-  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %x, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-
 define < 2 x double > @test_x86_fma_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfmaddpd
-  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-define < 2 x double > @test_x86_fma_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) {
-  ; CHECK: vfmaddpd (%{{.*}})
-  %x = load <2 x double>* %a2
-  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-define < 2 x double > @test_x86_fma_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) {
-  ; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
-  %x = load <2 x double>* %a1
-  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfmaddpd
+  ; CHECK-FMA: vfmadd213pd
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
   ret < 2 x double > %res
 }
 declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
-; To test execution dependency
-define < 2 x double > @test_x86_fma_vfmadd_pd_load3(< 2 x double >* %a0, < 2 x double >* %a1, < 2 x double > %a2) {
-  ; CHECK: vmovapd
-  ; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
-  %x = load <2 x double>* %a0
-  %y = load <2 x double>* %a1
-  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %x, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-
 define < 8 x float > @test_x86_fma_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
-  ; CHECK: vfmaddps
+  ; CHECK-FMA4: vfmaddps
+  ; CHECK-FMA: vfmadd213ps
   ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
   ret < 8 x float > %res
 }
 declare < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
 
 define < 4 x double > @test_x86_fma_vfmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
-  ; CHECK: vfmaddpd
+  ; CHECK-FMA4: vfmaddpd
+  ; CHECK-FMA: vfmadd213pd
   ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
   ret < 4 x double > %res
 }
 declare < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
 
 ; VFMSUB
 define < 4 x float > @test_x86_fma_vfmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfmsubss
-  %res = call < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfmsubss
+  ; CHECK-FMA: vfmsub213ss
+  %res = call < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
   ret < 4 x float > %res
 }
 declare < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
 define < 2 x double > @test_x86_fma_vfmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfmsubsd
-  %res = call < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfmsubsd
+  ; CHECK-FMA: vfmsub213sd
+  %res = call < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
   ret < 2 x double > %res
 }
 declare < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
 define < 4 x float > @test_x86_fma_vfmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfmsubps
-  %res = call < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfmsubps
+  ; CHECK-FMA: vfmsub213ps
+  %res = call < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
   ret < 4 x float > %res
 }
 declare < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
 define < 2 x double > @test_x86_fma_vfmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfmsubpd
-  %res = call < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfmsubpd
+  ; CHECK-FMA: vfmsub213pd
+  %res = call < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
   ret < 2 x double > %res
 }
 declare < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
 define < 8 x float > @test_x86_fma_vfmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
-  ; CHECK: vfmsubps
+  ; CHECK-FMA4: vfmsubps
+  ; CHECK-FMA: vfmsub213ps
   ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
   ret < 8 x float > %res
 }
 declare < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
 
 define < 4 x double > @test_x86_fma_vfmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
-  ; CHECK: vfmsubpd
+  ; CHECK-FMA4: vfmsubpd
+  ; CHECK-FMA: vfmsub213pd
   ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
   ret < 4 x double > %res
 }
 declare < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
 
 ; VFNMADD
 define < 4 x float > @test_x86_fma_vfnmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfnmaddss
-  %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfnmaddss
+  ; CHECK-FMA: vfnmadd213ss
+  %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
   ret < 4 x float > %res
 }
 declare < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
 define < 2 x double > @test_x86_fma_vfnmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfnmaddsd
-  %res = call < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfnmaddsd
+  ; CHECK-FMA: vfnmadd213sd
+  %res = call < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
   ret < 2 x double > %res
 }
 declare < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
 define < 4 x float > @test_x86_fma_vfnmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfnmaddps
-  %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfnmaddps
+  ; CHECK-FMA: vfnmadd213ps
+  %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
   ret < 4 x float > %res
 }
 declare < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
 define < 2 x double > @test_x86_fma_vfnmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfnmaddpd
-  %res = call < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfnmaddpd
+  ; CHECK-FMA: vfnmadd213pd
+  %res = call < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
   ret < 2 x double > %res
 }
 declare < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
 define < 8 x float > @test_x86_fma_vfnmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
-  ; CHECK: vfnmaddps
+  ; CHECK-FMA4: vfnmaddps
+  ; CHECK-FMA: vfnmadd213ps
   ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
   ret < 8 x float > %res
 }
 declare < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
 
 define < 4 x double > @test_x86_fma_vfnmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
-  ; CHECK: vfnmaddpd
+  ; CHECK-FMA4: vfnmaddpd
+  ; CHECK-FMA: vfnmadd213pd
   ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
   ret < 4 x double > %res
 }
 declare < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
 
 ; VFNMSUB
 define < 4 x float > @test_x86_fma_vfnmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfnmsubss
-  %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfnmsubss
+  ; CHECK-FMA: vfnmsub213ss
+  %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
   ret < 4 x float > %res
 }
 declare < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
 define < 2 x double > @test_x86_fma_vfnmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfnmsubsd
-  %res = call < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfnmsubsd
+  ; CHECK-FMA: vfnmsub213sd
+  %res = call < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
   ret < 2 x double > %res
 }
 declare < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
 define < 4 x float > @test_x86_fma_vfnmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfnmsubps
-  %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfnmsubps
+  ; CHECK-FMA: vfnmsub213ps
+  %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
   ret < 4 x float > %res
 }
 declare < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
 define < 2 x double > @test_x86_fma_vfnmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfnmsubpd
-  %res = call < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfnmsubpd
+  ; CHECK-FMA: vfnmsub213pd
+  %res = call < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
   ret < 2 x double > %res
 }
 declare < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
 define < 8 x float > @test_x86_fma_vfnmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
-  ; CHECK: vfnmsubps
+  ; CHECK-FMA4: vfnmsubps
+  ; CHECK-FMA: vfnmsub213ps
   ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
   ret < 8 x float > %res
 }
 declare < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
 
 define < 4 x double > @test_x86_fma_vfnmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
-  ; CHECK: vfnmsubpd
+  ; CHECK-FMA4: vfnmsubpd
+  ; CHECK-FMA: vfnmsub213pd
   ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
   ret < 4 x double > %res
 }
 declare < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
 
 ; VFMADDSUB
 define < 4 x float > @test_x86_fma_vfmaddsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfmaddsubps
-  %res = call < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfmaddsubps
+  ; CHECK-FMA: vfmaddsub213ps
+  %res = call < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
   ret < 4 x float > %res
 }
 declare < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
 define < 2 x double > @test_x86_fma_vfmaddsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfmaddsubpd
-  %res = call < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfmaddsubpd
+  ; CHECK-FMA: vfmaddsub213pd
+  %res = call < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
   ret < 2 x double > %res
 }
 declare < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
 define < 8 x float > @test_x86_fma_vfmaddsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
-  ; CHECK: vfmaddsubps
+  ; CHECK-FMA4: vfmaddsubps
+  ; CHECK-FMA: vfmaddsub213ps
   ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
   ret < 8 x float > %res
 }
 declare < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
 
 define < 4 x double > @test_x86_fma_vfmaddsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
-  ; CHECK: vfmaddsubpd
+  ; CHECK-FMA4: vfmaddsubpd
+  ; CHECK-FMA: vfmaddsub213pd
   ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
   ret < 4 x double > %res
 }
 declare < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
 
 ; VFMSUBADD
 define < 4 x float > @test_x86_fma_vfmsubadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfmsubaddps
-  %res = call < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfmsubaddps
+  ; CHECK-FMA: vfmsubadd213ps
+  %res = call < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)
   ret < 4 x float > %res
 }
 declare < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
 
 define < 2 x double > @test_x86_fma_vfmsubadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfmsubaddpd
-  %res = call < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+  ; CHECK-FMA4: vfmsubaddpd
+  ; CHECK-FMA: vfmsubadd213pd
+  %res = call < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)
   ret < 2 x double > %res
 }
 declare < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
 
 define < 8 x float > @test_x86_fma_vfmsubadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
-  ; CHECK: vfmsubaddps
+  ; CHECK-FMA4: vfmsubaddps
+  ; CHECK-FMA: vfmsubadd213ps
   ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+  %res = call < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)
   ret < 8 x float > %res
 }
 declare < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
 
 define < 4 x double > @test_x86_fma_vfmsubadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
-  ; CHECK: vfmsubaddpd
+  ; CHECK-FMA4: vfmsubaddpd
+  ; CHECK-FMA: vfmsubadd213pd
   ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+  %res = call < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)
   ret < 4 x double > %res
 }
 declare < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
diff --git a/test/CodeGen/X86/fma-phi-213-to-231.ll b/test/CodeGen/X86/fma-phi-213-to-231.ll
new file mode 100644
index 0000000..9715bc7
--- /dev/null
+++ b/test/CodeGen/X86/fma-phi-213-to-231.ll
@@ -0,0 +1,246 @@
+; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; CHECK-LABEL: fmaddsubpd_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK:   vfmaddsub231pd        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
+; CHECK:   cmpl  {{%.+}}, [[INDREG]]
+; CHECK:   jl    [[BODYLBL]]
+define <4 x double> @fmaddsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %iter
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc
+
+for.inc:
+  %0 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <4 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubaddpd_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK:   vfmsubadd231pd        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
+; CHECK:   cmpl  {{%.+}}, [[INDREG]]
+; CHECK:   jl    [[BODYLBL]]
+define <4 x double> @fmsubaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %iter
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc
+
+for.inc:
+  %0 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <4 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fmaddpd_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK:   vfmadd231pd        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
+; CHECK:   cmpl  {{%.+}}, [[INDREG]]
+; CHECK:   jl    [[BODYLBL]]
+define <4 x double> @fmaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %iter
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc
+
+for.inc:
+  %0 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <4 x double> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubpd_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK:   vfmsub231pd        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
+; CHECK:   cmpl  {{%.+}}, [[INDREG]]
+; CHECK:   jl    [[BODYLBL]]
+define <4 x double> @fmsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %iter
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc
+
+for.inc:
+  %0 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <4 x double> %c.addr.0
+}
+
+declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+
+; CHECK-LABEL: fmaddsubps_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK:   vfmaddsub231ps        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
+; CHECK:   cmpl  {{%.+}}, [[INDREG]]
+; CHECK:   jl    [[BODYLBL]]
+define <8 x float> @fmaddsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %iter
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc
+
+for.inc:
+  %0 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <8 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubaddps_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK:   vfmsubadd231ps        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
+; CHECK:   cmpl  {{%.+}}, [[INDREG]]
+; CHECK:   jl    [[BODYLBL]]
+define <8 x float> @fmsubaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %iter
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc
+
+for.inc:
+  %0 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <8 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fmaddps_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK:   vfmadd231ps        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
+; CHECK:   cmpl  {{%.+}}, [[INDREG]]
+; CHECK:   jl    [[BODYLBL]]
+define <8 x float> @fmaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %iter
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc
+
+for.inc:
+  %0 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <8 x float> %c.addr.0
+}
+
+; CHECK-LABEL: fmsubps_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK:   vfmsub231ps        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
+; CHECK:   cmpl  {{%.+}}, [[INDREG]]
+; CHECK:   jl    [[BODYLBL]]
+define <8 x float> @fmsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %iter
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc
+
+for.inc:
+  %0 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <8 x float> %c.addr.0
+}
+
+declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index 47252ec..2eb152b 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -43,8 +43,8 @@ entry:
 }
 
 ; Test FMA3 variant selection
-; CHECK: fma3_select231ssX:
-; CHECK: vfmadd231ss xmm
+; CHECK-FMA-INST: fma3_select231ssX:
+; CHECK-FMA-INST: vfmadd231ss %xmm
 define float @fma3_select231ssX(float %x, float %y) #0 {
 entry:
   br label %while.body
@@ -58,8 +58,8 @@ while.end:                                        ; preds = %while.body, %entry
 }
 
 ; Test FMA3 variant selection
-; CHECK: fma3_select231pdY:
-; CHECK: vfmadd231pd ymm
+; CHECK-FMA-INST: fma3_select231pdY:
+; CHECK-FMA-INST: vfmadd231pd %ymm
 define <4 x double> @fma3_select231pdY(<4 x double> %x, <4 x double> %y) #0 {
 entry:
   br label %while.body
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll
new file mode 100644
index 0000000..64a2068
--- /dev/null
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll
@@ -0,0 +1,84 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
+
+; VFMADD
+define < 4 x float > @test_x86_fma_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) {
+  ; CHECK: vfmaddss (%{{.*}})
+  %x = load float *%a2
+  %y = insertelement <4 x float> undef, float %x, i32 0
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y)
+  ret < 4 x float > %res
+}
+define < 4 x float > @test_x86_fma_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) {
+  ; CHECK: vfmaddss %{{.*}}, (%{{.*}})
+  %x = load float *%a1
+  %y = insertelement <4 x float> undef, float %x, i32 0
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2)
+  ret < 4 x float > %res
+}
+
+declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) {
+  ; CHECK: vfmaddsd (%{{.*}})
+  %x = load double *%a2
+  %y = insertelement <2 x double> undef, double %x, i32 0
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y)
+  ret < 2 x double > %res
+}
+define < 2 x double > @test_x86_fma_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) {
+  ; CHECK: vfmaddsd %{{.*}}, (%{{.*}})
+  %x = load double *%a1
+  %y = insertelement <2 x double> undef, double %x, i32 0
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2)
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+define < 4 x float > @test_x86_fma_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) {
+  ; CHECK: vfmaddps (%{{.*}})
+  %x = load <4 x float>* %a2
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x)
+  ret < 4 x float > %res
+}
+define < 4 x float > @test_x86_fma_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) {
+  ; CHECK: vfmaddps %{{.*}}, (%{{.*}})
+  %x = load <4 x float>* %a1
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2)
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+; To test execution dependency
+define < 4 x float > @test_x86_fma_vfmadd_ps_load3(< 4 x float >* %a0, < 4 x float >* %a1, < 4 x float > %a2) {
+  ; CHECK: vmovaps
+  ; CHECK: vfmaddps %{{.*}}, (%{{.*}})
+  %x = load <4 x float>* %a0
+  %y = load <4 x float>* %a1
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %x, < 4 x float > %y, < 4 x float > %a2)
+  ret < 4 x float > %res
+}
+
+define < 2 x double > @test_x86_fma_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) {
+  ; CHECK: vfmaddpd (%{{.*}})
+  %x = load <2 x double>* %a2
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x)
+  ret < 2 x double > %res
+}
+define < 2 x double > @test_x86_fma_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) {
+  ; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
+  %x = load <2 x double>* %a1
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2)
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+; To test execution dependency
+define < 2 x double > @test_x86_fma_vfmadd_pd_load3(< 2 x double >* %a0, < 2 x double >* %a1, < 2 x double > %a2) {
+  ; CHECK: vmovapd
+  ; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
+  %x = load <2 x double>* %a0
+  %y = load <2 x double>* %a1
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %x, < 2 x double > %y, < 2 x double > %a2)
+  ret < 2 x double > %res
+}
+
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index cfb598d..9b52db9 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -184,7 +184,7 @@ define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
 
 ; CHECK: test_x86_fmadd_ps_load
 ; CHECK: vmovaps         (%rdi), %xmm2
-; CHECK: vfmadd213ps     %xmm1, %xmm0, %xmm2
+; CHECK: vfmadd213ps     %xmm1, %xmm2, %xmm0
 ; CHECK: ret
 ; CHECK_FMA4: test_x86_fmadd_ps_load
 ; CHECK_FMA4: vfmaddps     %xmm1, (%rdi), %xmm0, %xmm0
@@ -198,7 +198,7 @@ define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4
 
 ; CHECK: test_x86_fmsub_ps_load
 ; CHECK: vmovaps         (%rdi), %xmm2
-; CHECK: fmsub213ps     %xmm1, %xmm0, %xmm2
+; CHECK: fmsub213ps     %xmm1, %xmm2, %xmm0
 ; CHECK: ret
 ; CHECK_FMA4: test_x86_fmsub_ps_load
 ; CHECK_FMA4: vfmsubps     %xmm1, (%rdi), %xmm0, %xmm0
diff --git a/test/CodeGen/X86/fmaxnum.ll b/test/CodeGen/X86/fmaxnum.ll
new file mode 100644
index 0000000..23678c4
--- /dev/null
+++ b/test/CodeGen/X86/fmaxnum.ll
@@ -0,0 +1,50 @@
+; RUN: llc  -march=x86 -mtriple=i386-linux-gnu  < %s | FileCheck %s
+
+declare float @fmaxf(float, float)
+declare double @fmax(double, double)
+declare x86_fp80 @fmaxl(x86_fp80, x86_fp80)
+declare float @llvm.maxnum.f32(float, float)
+declare double @llvm.maxnum.f64(double, double)
+declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80)
+
+; CHECK-LABEL: @test_fmaxf
+; CHECK: calll fmaxf
+define float @test_fmaxf(float %x, float %y) {
+  %z = call float @fmaxf(float %x, float %y) readnone
+  ret float %z
+}
+
+; CHECK-LABEL: @test_fmax
+; CHECK: calll fmax
+define double @test_fmax(double %x, double %y) {
+  %z = call double @fmax(double %x, double %y) readnone
+  ret double %z
+}
+
+; CHECK-LABEL: @test_fmaxl
+; CHECK: calll fmaxl
+define x86_fp80 @test_fmaxl(x86_fp80 %x, x86_fp80 %y) {
+  %z = call x86_fp80 @fmaxl(x86_fp80 %x, x86_fp80 %y) readnone
+  ret x86_fp80 %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmaxf
+; CHECK: calll fmaxf
+define float @test_intrinsic_fmaxf(float %x, float %y) {
+  %z = call float @llvm.maxnum.f32(float %x, float %y) readnone
+  ret float %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmax
+; CHECK: calll fmax
+define double @test_intrinsic_fmax(double %x, double %y) {
+  %z = call double @llvm.maxnum.f64(double %x, double %y) readnone
+  ret double %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmaxl
+; CHECK: calll fmaxl
+define x86_fp80 @test_intrinsic_fmaxl(x86_fp80 %x, x86_fp80 %y) {
+  %z = call x86_fp80 @llvm.maxnum.f80(x86_fp80 %x, x86_fp80 %y) readnone
+  ret x86_fp80 %z
+}
diff --git a/test/CodeGen/X86/fminnum.ll b/test/CodeGen/X86/fminnum.ll
new file mode 100644
index 0000000..1e33cf4
--- /dev/null
+++ b/test/CodeGen/X86/fminnum.ll
@@ -0,0 +1,95 @@
+; RUN: llc  -march=x86 -mtriple=i386-linux-gnu -mattr=+sse,+sse2 < %s | FileCheck %s
+
+declare float @fminf(float, float)
+declare double @fmin(double, double)
+declare x86_fp80 @fminl(x86_fp80, x86_fp80)
+declare float @llvm.minnum.f32(float, float)
+declare double @llvm.minnum.f64(double, double)
+declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80)
+
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
+declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>)
+
+; CHECK-LABEL: @test_fminf
+; CHECK: jmp fminf
+define float @test_fminf(float %x, float %y) {
+  %z = call float @fminf(float %x, float %y) readnone
+  ret float %z
+}
+
+; CHECK-LABEL: @test_fmin
+; CHECK: jmp fmin
+define double @test_fmin(double %x, double %y) {
+  %z = call double @fmin(double %x, double %y) readnone
+  ret double %z
+}
+
+; CHECK-LABEL: @test_fminl
+; CHECK: calll fminl
+define x86_fp80 @test_fminl(x86_fp80 %x, x86_fp80 %y) {
+  %z = call x86_fp80 @fminl(x86_fp80 %x, x86_fp80 %y) readnone
+  ret x86_fp80 %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fminf
+; CHECK: jmp fminf
+define float @test_intrinsic_fminf(float %x, float %y) {
+  %z = call float @llvm.minnum.f32(float %x, float %y) readnone
+  ret float %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin
+; CHECK: jmp fmin
+define double @test_intrinsic_fmin(double %x, double %y) {
+  %z = call double @llvm.minnum.f64(double %x, double %y) readnone
+  ret double %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fminl
+; CHECK: calll fminl
+define x86_fp80 @test_intrinsic_fminl(x86_fp80 %x, x86_fp80 %y) {
+  %z = call x86_fp80 @llvm.minnum.f80(x86_fp80 %x, x86_fp80 %y) readnone
+  ret x86_fp80 %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_v2f32
+; CHECK: calll fminf
+; CHECK: calll fminf
+define <2 x float> @test_intrinsic_fmin_v2f32(<2 x float> %x, <2 x float> %y) {
+  %z = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> %y) readnone
+  ret <2 x float> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_v4f32
+; CHECK: calll fminf
+; CHECK: calll fminf
+; CHECK: calll fminf
+; CHECK: calll fminf
+define <4 x float> @test_intrinsic_fmin_v4f32(<4 x float> %x, <4 x float> %y) {
+  %z = call <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) readnone
+  ret <4 x float> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_v2f64
+; CHECK: calll fmin
+; CHECK: calll fmin
+define <2 x double> @test_intrinsic_fmin_v2f64(<2 x double> %x, <2 x double> %y) {
+  %z = call <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> %y) readnone
+  ret <2 x double> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_v8f64
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+define <8 x double> @test_intrinsic_fmin_v8f64(<8 x double> %x, <8 x double> %y) {
+  %z = call <8 x double> @llvm.minnum.v8f64(<8 x double> %x, <8 x double> %y) readnone
+  ret <8 x double> %z
+}
diff --git a/test/CodeGen/X86/fmul-combines.ll b/test/CodeGen/X86/fmul-combines.ll
new file mode 100644
index 0000000..7036511
--- /dev/null
+++ b/test/CodeGen/X86/fmul-combines.ll
@@ -0,0 +1,147 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -march=x86-64 < %s | FileCheck %s
+
+; CHECK-LABEL: fmul2_f32:
+; CHECK: addss %xmm0, %xmm0
+define float @fmul2_f32(float %x) {
+  %y = fmul float %x, 2.0
+  ret float %y
+}
+
+; fmul 2.0, x -> fadd x, x for vectors.
+
+; CHECK-LABEL: fmul2_v4f32:
+; CHECK: addps %xmm0, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fmul2_v4f32(<4 x float> %x) {
+  %y = fmul <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0>
+  ret <4 x float> %y
+}
+
+; CHECK-LABEL: constant_fold_fmul_v4f32:
+; CHECK: movaps
+; CHECK-NEXT: ret
+define <4 x float> @constant_fold_fmul_v4f32(<4 x float> %x) {
+  %y = fmul <4 x float> <float 4.0, float 4.0, float 4.0, float 4.0>, <float 2.0, float 2.0, float 2.0, float 2.0>
+  ret <4 x float> %y
+}
+
+; CHECK-LABEL: fmul0_v4f32:
+; CHECK: xorps %xmm0, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fmul0_v4f32(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 0.0, float 0.0, float 0.0, float 0.0>
+  ret <4 x float> %y
+}
+
+; CHECK-LABEL: fmul_c2_c4_v4f32:
+; CHECK-NOT: addps
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_c2_c4_v4f32(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0>
+  %z = fmul <4 x float> %y, <float 4.0, float 4.0, float 4.0, float 4.0>
+  ret <4 x float> %z
+}
+
+; CHECK-LABEL: fmul_c3_c4_v4f32:
+; CHECK-NOT: addps
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_c3_c4_v4f32(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 3.0, float 3.0, float 3.0, float 3.0>
+  %z = fmul <4 x float> %y, <float 4.0, float 4.0, float 4.0, float 4.0>
+  ret <4 x float> %z
+}
+
+; We should be able to pre-multiply the two constant vectors.
+; CHECK: float 5.000000e+00
+; CHECK: float 1.200000e+01
+; CHECK: float 2.100000e+01
+; CHECK: float 3.200000e+01
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat:
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0>
+  ret <4 x float> %z
+}
+
+; Same as above, but reverse operands to make sure non-canonical form is also handled.
+; CHECK: float 5.000000e+00
+; CHECK: float 1.200000e+01
+; CHECK: float 2.100000e+01
+; CHECK: float 3.200000e+01
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_non_canonical:
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat_non_canonical(<4 x float> %x) #0 {
+  %y = fmul <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
+  %z = fmul <4 x float> <float 5.0, float 6.0, float 7.0, float 8.0>, %y
+  ret <4 x float> %z
+}
+
+; More than one use of a constant multiply should not inhibit the optimization.
+; Instead of a chain of 2 dependent mults, this test will have 2 independent mults. 
+; CHECK: float 5.000000e+00
+; CHECK: float 1.200000e+01
+; CHECK: float 2.100000e+01
+; CHECK: float 3.200000e+01
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_multiple_use:
+; CHECK: mulps
+; CHECK: mulps
+; CHECK: addps
+; CHECK: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat_multiple_use(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0>
+  %a = fadd <4 x float> %y, %z
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: fmul_c2_c4_f32:
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: mulss
+; CHECK-NEXT: ret
+define float @fmul_c2_c4_f32(float %x) #0 {
+  %y = fmul float %x, 2.0
+  %z = fmul float %y, 4.0
+  ret float %z
+}
+
+; CHECK-LABEL: fmul_c3_c4_f32:
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: mulss
+; CHECK-NET: ret
+define float @fmul_c3_c4_f32(float %x) #0 {
+  %y = fmul float %x, 3.0
+  %z = fmul float %y, 4.0
+  ret float %z
+}
+
+; CHECK-LABEL: fmul_fneg_fneg_f32:
+; CHECK: mulss %xmm1, %xmm0
+; CHECK-NEXT: retq
+define float @fmul_fneg_fneg_f32(float %x, float %y) {
+  %x.neg = fsub float -0.0, %x
+  %y.neg = fsub float -0.0, %y
+  %mul = fmul float %x.neg, %y.neg
+  ret float %mul
+}
+; CHECK-LABEL: fmul_fneg_fneg_v4f32:
+; CHECK: mulps {{%xmm1|\(%rdx\)}}, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fmul_fneg_fneg_v4f32(<4 x float> %x, <4 x float> %y) {
+  %x.neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %x
+  %y.neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %y
+  %mul = fmul <4 x float> %x.neg, %y.neg
+  ret <4 x float> %mul
+}
+
+attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/fnabs.ll b/test/CodeGen/X86/fnabs.ll
new file mode 100644
index 0000000..19718d3
--- /dev/null
+++ b/test/CodeGen/X86/fnabs.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx| FileCheck %s
+
+; Verify that we generate a single OR instruction for a scalar, vec128, and vec256
+; FNABS(x) operation -> FNEG (FABS(x)).
+; If the FABS() result isn't used, the AND instruction should be eliminated.
+; PR20578: http://llvm.org/bugs/show_bug.cgi?id=20578
+
+define float @scalar_no_abs(float %a) {
+; CHECK-LABEL: scalar_no_abs:
+; CHECK: vorps
+; CHECK-NEXT: retq
+  %fabs = tail call float @fabsf(float %a) #1
+  %fsub = fsub float -0.0, %fabs
+  ret float %fsub
+}
+
+define float @scalar_uses_abs(float %a) {
+; CHECK-LABEL: scalar_uses_abs:
+; CHECK-DAG: vandps
+; CHECK-DAG: vorps
+; CHECK: vmulss
+; CHECK-NEXT: retq
+  %fabs = tail call float @fabsf(float %a) #1
+  %fsub = fsub float -0.0, %fabs
+  %fmul = fmul float %fsub, %fabs
+  ret float %fmul
+}
+
+define <4 x float> @vector128_no_abs(<4 x float> %a) {
+; CHECK-LABEL: vector128_no_abs:
+; CHECK: vorps
+; CHECK-NEXT: retq
+  %fabs = tail call <4 x float> @llvm.fabs.v4f32(< 4 x float> %a) #1
+  %fsub = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %fabs
+  ret <4 x float> %fsub
+}
+
+define <4 x float> @vector128_uses_abs(<4 x float> %a) {
+; CHECK-LABEL: vector128_uses_abs:
+; CHECK-DAG: vandps
+; CHECK-DAG: vorps
+; CHECK: vmulps
+; CHECK-NEXT: retq
+  %fabs = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #1
+  %fsub = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %fabs
+  %fmul = fmul <4 x float> %fsub, %fabs
+  ret <4 x float> %fmul
+}
+
+define <8 x float> @vector256_no_abs(<8 x float> %a) {
+; CHECK-LABEL: vector256_no_abs:
+; CHECK: vorps
+; CHECK-NEXT: retq
+  %fabs = tail call <8 x float> @llvm.fabs.v8f32(< 8 x float> %a) #1
+  %fsub = fsub <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %fabs
+  ret <8 x float> %fsub
+}
+
+define <8 x float> @vector256_uses_abs(<8 x float> %a) {
+; CHECK-LABEL: vector256_uses_abs:
+; CHECK-DAG: vandps
+; CHECK-DAG: vorps
+; CHECK: vmulps
+; CHECK-NEXT: retq
+  %fabs = tail call <8 x float> @llvm.fabs.v8f32(<8 x float> %a) #1
+  %fsub = fsub <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %fabs
+  %fmul = fmul <8 x float> %fsub, %fabs
+  ret <8 x float> %fmul
+}
+
+declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
+declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
+
+declare float @fabsf(float)
+
+attributes #1 = { readnone }
+
diff --git a/test/CodeGen/X86/fold-pcmpeqd-0.ll b/test/CodeGen/X86/fold-pcmpeqd-0.ll
deleted file mode 100644
index 1d315ff..0000000
--- a/test/CodeGen/X86/fold-pcmpeqd-0.ll
+++ /dev/null
@@ -1,117 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=X86-64 %s
-; DISABLED: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah -regalloc=linearscan | FileCheck --check-prefix=I386 %s
-
-; i386 test has been disabled when scheduler 2-addr hack is disabled.
-
-; This testcase shouldn't need to spill the -1 value,
-; so it should just use pcmpeqd to materialize an all-ones vector.
-; For i386, cp load of -1 are folded.
-
-; With -regalloc=greedy, the live range is split before spilling, so the first
-; pcmpeq doesn't get folded as a constant pool load.
-
-; I386-NOT: pcmpeqd
-; I386: orps LCPI0_2, %xmm
-; I386-NOT: pcmpeqd
-; I386: orps LCPI0_2, %xmm
-
-; X86-64: pcmpeqd
-; X86-64-NOT: pcmpeqd
-
-	%struct.__ImageExecInfo = type <{ <4 x i32>, <4 x float>, <2 x i64>, i8*, i8*, i8*, i32, i32, i32, i32, i32 }>
-	%struct._cl_image_format_t = type <{ i32, i32, i32 }>
-	%struct._image2d_t = type <{ i8*, %struct._cl_image_format_t, i32, i32, i32, i32, i32, i32 }>
-
-define void @program_1(%struct._image2d_t* %dest, %struct._image2d_t* %t0, <4 x float> %p0, <4 x float> %p1, <4 x float> %p4, <4 x float> %p5, <4 x float> %p6) nounwind {
-entry:
-	%tmp3.i = load i32* null		; <i32> [#uses=1]
-	%cmp = icmp sgt i32 %tmp3.i, 200		; <i1> [#uses=1]
-	br i1 %cmp, label %forcond, label %ifthen
-
-ifthen:		; preds = %entry
-	ret void
-
-forcond:		; preds = %entry
-	%tmp3.i536 = load i32* null		; <i32> [#uses=1]
-	%cmp12 = icmp slt i32 0, %tmp3.i536		; <i1> [#uses=1]
-	br i1 %cmp12, label %forbody, label %afterfor
-
-forbody:		; preds = %forcond
-	%bitcast204.i313 = bitcast <4 x i32> zeroinitializer to <4 x float>		; <<4 x float>> [#uses=1]
-	%mul233 = fmul <4 x float> %bitcast204.i313, zeroinitializer		; <<4 x float>> [#uses=1]
-	%mul257 = fmul <4 x float> %mul233, zeroinitializer		; <<4 x float>> [#uses=1]
-	%mul275 = fmul <4 x float> %mul257, zeroinitializer		; <<4 x float>> [#uses=1]
-	%tmp51 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %mul275, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
-	%bitcast198.i182 = bitcast <4 x float> zeroinitializer to <4 x i32>		; <<4 x i32>> [#uses=0]
-	%bitcast204.i185 = bitcast <4 x i32> zeroinitializer to <4 x float>		; <<4 x float>> [#uses=1]
-	%tmp69 = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> zeroinitializer) nounwind		; <<4 x i32>> [#uses=1]
-	%tmp70 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp69) nounwind		; <<4 x float>> [#uses=1]
-	%sub140.i78 = fsub <4 x float> zeroinitializer, %tmp70		; <<4 x float>> [#uses=2]
-	%mul166.i86 = fmul <4 x float> zeroinitializer, %sub140.i78		; <<4 x float>> [#uses=1]
-	%add167.i87 = fadd <4 x float> %mul166.i86, < float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000 >		; <<4 x float>> [#uses=1]
-	%mul171.i88 = fmul <4 x float> %add167.i87, %sub140.i78		; <<4 x float>> [#uses=1]
-	%add172.i89 = fadd <4 x float> %mul171.i88, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 >		; <<4 x float>> [#uses=1]
-	%bitcast176.i90 = bitcast <4 x float> %add172.i89 to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andnps178.i92 = and <4 x i32> %bitcast176.i90, zeroinitializer		; <<4 x i32>> [#uses=1]
-	%bitcast179.i93 = bitcast <4 x i32> %andnps178.i92 to <4 x float>		; <<4 x float>> [#uses=1]
-	%mul186.i96 = fmul <4 x float> %bitcast179.i93, zeroinitializer		; <<4 x float>> [#uses=1]
-	%bitcast190.i98 = bitcast <4 x float> %mul186.i96 to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andnps192.i100 = and <4 x i32> %bitcast190.i98, zeroinitializer		; <<4 x i32>> [#uses=1]
-	%xorps.i102 = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 >		; <<4 x i32>> [#uses=1]
-	%orps203.i103 = or <4 x i32> %andnps192.i100, %xorps.i102		; <<4 x i32>> [#uses=1]
-	%bitcast204.i104 = bitcast <4 x i32> %orps203.i103 to <4 x float>		; <<4 x float>> [#uses=1]
-	%cmple.i = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> %tmp51, i8 2) nounwind		; <<4 x float>> [#uses=1]
-	%tmp80 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
-	%sub140.i = fsub <4 x float> zeroinitializer, %tmp80		; <<4 x float>> [#uses=1]
-	%bitcast148.i = bitcast <4 x float> zeroinitializer to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andnps150.i = and <4 x i32> %bitcast148.i, < i32 -2139095041, i32 -2139095041, i32 -2139095041, i32 -2139095041 >		; <<4 x i32>> [#uses=0]
-	%mul171.i = fmul <4 x float> zeroinitializer, %sub140.i		; <<4 x float>> [#uses=1]
-	%add172.i = fadd <4 x float> %mul171.i, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 >		; <<4 x float>> [#uses=1]
-	%bitcast176.i = bitcast <4 x float> %add172.i to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andnps178.i = and <4 x i32> %bitcast176.i, zeroinitializer		; <<4 x i32>> [#uses=1]
-	%bitcast179.i = bitcast <4 x i32> %andnps178.i to <4 x float>		; <<4 x float>> [#uses=1]
-	%mul186.i = fmul <4 x float> %bitcast179.i, zeroinitializer		; <<4 x float>> [#uses=1]
-	%bitcast189.i = bitcast <4 x float> zeroinitializer to <4 x i32>		; <<4 x i32>> [#uses=0]
-	%bitcast190.i = bitcast <4 x float> %mul186.i to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andnps192.i = and <4 x i32> %bitcast190.i, zeroinitializer		; <<4 x i32>> [#uses=1]
-	%bitcast198.i = bitcast <4 x float> %cmple.i to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%xorps.i = xor <4 x i32> %bitcast198.i, < i32 -1, i32 -1, i32 -1, i32 -1 >		; <<4 x i32>> [#uses=1]
-	%orps203.i = or <4 x i32> %andnps192.i, %xorps.i		; <<4 x i32>> [#uses=1]
-	%bitcast204.i = bitcast <4 x i32> %orps203.i to <4 x float>		; <<4 x float>> [#uses=1]
-	%mul307 = fmul <4 x float> %bitcast204.i185, zeroinitializer		; <<4 x float>> [#uses=1]
-	%mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer		; <<4 x float>> [#uses=2]
-	%mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer		; <<4 x float>> [#uses=1]
-	%tmp82 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul307, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
-	%bitcast11.i15 = bitcast <4 x float> %tmp82 to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andnps.i17 = and <4 x i32> %bitcast11.i15, zeroinitializer		; <<4 x i32>> [#uses=1]
-	%orps.i18 = or <4 x i32> %andnps.i17, zeroinitializer		; <<4 x i32>> [#uses=1]
-	%bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float>		; <<4 x float>> [#uses=1]
-	%tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
-	%bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%bitcast6.i4 = bitcast <4 x float> zeroinitializer to <4 x i32>		; <<4 x i32>> [#uses=2]
-	%andps.i5 = and <4 x i32> %bitcast.i3, %bitcast6.i4		; <<4 x i32>> [#uses=1]
-	%bitcast11.i6 = bitcast <4 x float> %tmp83 to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%not.i7 = xor <4 x i32> %bitcast6.i4, < i32 -1, i32 -1, i32 -1, i32 -1 >		; <<4 x i32>> [#uses=1]
-	%andnps.i8 = and <4 x i32> %bitcast11.i6, %not.i7		; <<4 x i32>> [#uses=1]
-	%orps.i9 = or <4 x i32> %andnps.i8, %andps.i5		; <<4 x i32>> [#uses=1]
-	%bitcast17.i10 = bitcast <4 x i32> %orps.i9 to <4 x float>		; <<4 x float>> [#uses=1]
-	%bitcast.i = bitcast <4 x float> %mul313 to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andps.i = and <4 x i32> %bitcast.i, zeroinitializer		; <<4 x i32>> [#uses=1]
-	%orps.i = or <4 x i32> zeroinitializer, %andps.i		; <<4 x i32>> [#uses=1]
-	%bitcast17.i = bitcast <4 x i32> %orps.i to <4 x float>		; <<4 x float>> [#uses=1]
-	call void null(<4 x float> %bitcast17.i19, <4 x float> %bitcast17.i10, <4 x float> %bitcast17.i, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind
-	unreachable
-
-afterfor:		; preds = %forcond
-	ret void
-}
-
-declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
-
-declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
-
-declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
-
-declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
-
-declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll
new file mode 100644
index 0000000..a643d86
--- /dev/null
+++ b/test/CodeGen/X86/fold-tied-op.ll
@@ -0,0 +1,84 @@
+; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd < %s | FileCheck %s
+; Regression test for http://reviews.llvm.org/D5701
+
+; ModuleID = 'xxhash.i'
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386--netbsd"
+
+; CHECK-LABEL: fn1
+; CHECK:       shldl {{.*#+}} 4-byte Folded Spill
+; CHECK:       orl   {{.*#+}} 4-byte Folded Reload
+; CHECK:       shldl {{.*#+}} 4-byte Folded Spill
+; CHECK:       orl   {{.*#+}} 4-byte Folded Reload
+; CHECK:       addl  {{.*#+}} 4-byte Folded Reload
+; CHECK:       imull {{.*#+}} 4-byte Folded Reload
+; CHECK:       orl   {{.*#+}} 4-byte Folded Reload
+; CHECK:       retl
+
+%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 }
+
+@a = common global i32 0, align 4
+@b = common global i64 0, align 8
+
+; Function Attrs: nounwind uwtable
+define i64 @fn1() #0 {
+entry:
+  %0 = load i32* @a, align 4, !tbaa !1
+  %1 = inttoptr i32 %0 to %struct.XXH_state64_t*
+  %total_len = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 0
+  %2 = load i32* %total_len, align 4, !tbaa !5
+  %tobool = icmp eq i32 %2, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %v3 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 3
+  %3 = load i64* %v3, align 4, !tbaa !8
+  %v4 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 4
+  %4 = load i64* %v4, align 4, !tbaa !9
+  %v2 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 2
+  %5 = load i64* %v2, align 4, !tbaa !10
+  %shl = shl i64 %5, 1
+  %or = or i64 %shl, %5
+  %shl2 = shl i64 %3, 2
+  %shr = lshr i64 %3, 1
+  %or3 = or i64 %shl2, %shr
+  %add = add i64 %or, %or3
+  %mul = mul i64 %4, -4417276706812531889
+  %shl4 = mul i64 %4, -8834553413625063778
+  %shr5 = ashr i64 %mul, 3
+  %or6 = or i64 %shr5, %shl4
+  %mul7 = mul nsw i64 %or6, 1400714785074694791
+  %xor = xor i64 %add, %mul7
+  store i64 %xor, i64* @b, align 8, !tbaa !11
+  %mul8 = mul nsw i64 %xor, 1400714785074694791
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %6 = load i64* @b, align 8, !tbaa !11
+  %xor10 = xor i64 %6, -4417276706812531889
+  %mul11 = mul nsw i64 %xor10, 400714785074694791
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %storemerge.in = phi i64 [ %mul11, %if.else ], [ %mul8, %if.then ]
+  %storemerge = add i64 %storemerge.in, -8796714831421723037
+  store i64 %storemerge, i64* @b, align 8, !tbaa !11
+  ret i64 undef
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.6 (trunk 219587)"}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !6, metadata !2, i64 0}
+!6 = metadata !{metadata !"XXH_state64_t", metadata !2, i64 0, metadata !2, i64 4, metadata !7, i64 8, metadata !7, i64 16, metadata !7, i64 24}
+!7 = metadata !{metadata !"long long", metadata !3, i64 0}
+!8 = metadata !{metadata !6, metadata !7, i64 16}
+!9 = metadata !{metadata !6, metadata !7, i64 24}
+!10 = metadata !{metadata !6, metadata !7, i64 8}
+!11 = metadata !{metadata !7, metadata !7, i64 0}
diff --git a/test/CodeGen/X86/fp-load-trunc.ll b/test/CodeGen/X86/fp-load-trunc.ll
index a973bef..e6c1e1a 100644
--- a/test/CodeGen/X86/fp-load-trunc.ll
+++ b/test/CodeGen/X86/fp-load-trunc.ll
@@ -2,57 +2,87 @@
 ; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX
 
 define <1 x float> @test1(<1 x double>* %p) nounwind {
-; CHECK: test1
-; CHECK: cvtsd2ss
-; CHECK: ret
-; AVX:   test1
-; AVX:   vcvtsd2ss
-; AVX:   ret
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movsd (%eax), %xmm0
+; CHECK-NEXT:    cvtsd2ss %xmm0, %xmm0
+; CHECK-NEXT:    movss %xmm0, (%esp)
+; CHECK-NEXT:    flds (%esp)
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    retl
+;
+; AVX-LABEL: test1:
+; AVX:       # BB#0:
+; AVX-NEXT:    pushl %eax
+; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT:    vmovsd (%eax), %xmm0
+; AVX-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovss %xmm0, (%esp)
+; AVX-NEXT:    flds (%esp)
+; AVX-NEXT:    popl %eax
+; AVX-NEXT:    retl
   %x = load <1 x double>* %p
   %y = fptrunc <1 x double> %x to <1 x float>
   ret <1 x float> %y
 }
 
 define <2 x float> @test2(<2 x double>* %p) nounwind {
-; CHECK: test2
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: ret
-; AVX:   test2
-; AVX:   vcvtpd2psx {{[0-9]*}}(%{{.*}})
-; AVX:   ret
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cvtpd2ps (%eax), %xmm0
+; CHECK-NEXT:    retl
+;
+; AVX-LABEL: test2:
+; AVX:       # BB#0:
+; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT:    vcvtpd2psx (%eax), %xmm0
+; AVX-NEXT:    retl
   %x = load <2 x double>* %p
   %y = fptrunc <2 x double> %x to <2 x float>
   ret <2 x float> %y
 }
 
 define <4 x float> @test3(<4 x double>* %p) nounwind {
-; CHECK: test3
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: movlhps
-; CHECK: ret
-; AVX:   test3
-; AVX:   vcvtpd2psy {{[0-9]*}}(%{{.*}})
-; AVX:   ret
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cvtpd2ps 16(%eax), %xmm1
+; CHECK-NEXT:    cvtpd2ps (%eax), %xmm0
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retl
+;
+; AVX-LABEL: test3:
+; AVX:       # BB#0:
+; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT:    vcvtpd2psy (%eax), %xmm0
+; AVX-NEXT:    retl
   %x = load <4 x double>* %p
   %y = fptrunc <4 x double> %x to <4 x float>
   ret <4 x float> %y
 }
 
 define <8 x float> @test4(<8 x double>* %p) nounwind {
-; CHECK: test4
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: movlhps
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: movlhps
-; CHECK: ret
-; AVX:   test4
-; AVX:   vcvtpd2psy
-; AVX:   vcvtpd2psy
-; AVX:   vinsertf128
-; AVX:   ret
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cvtpd2ps 16(%eax), %xmm1
+; CHECK-NEXT:    cvtpd2ps (%eax), %xmm0
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    cvtpd2ps 48(%eax), %xmm2
+; CHECK-NEXT:    cvtpd2ps 32(%eax), %xmm1
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; CHECK-NEXT:    retl
+;
+; AVX-LABEL: test4:
+; AVX:       # BB#0:
+; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT:    vcvtpd2psy (%eax), %xmm0
+; AVX-NEXT:    vcvtpd2psy 32(%eax), %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retl
   %x = load <8 x double>* %p
   %y = fptrunc <8 x double> %x to <8 x float>
   ret <8 x float> %y
diff --git a/test/CodeGen/X86/fp-trunc.ll b/test/CodeGen/X86/fp-trunc.ll
index 25442fc..6424bfc 100644
--- a/test/CodeGen/X86/fp-trunc.ll
+++ b/test/CodeGen/X86/fp-trunc.ll
@@ -2,55 +2,77 @@
 ; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX
 
 define <1 x float> @test1(<1 x double> %x) nounwind {
-; CHECK: test1
-; CHECK: cvtsd2ss
-; CHECK: ret
-; AVX:   test1
-; AVX:   vcvtsd2ss
-; AVX:   ret
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movsd {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT:    cvtsd2ss %xmm0, %xmm0
+; CHECK-NEXT:    movss %xmm0, (%esp)
+; CHECK-NEXT:    flds (%esp)
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    retl
+;
+; AVX-LABEL: test1:
+; AVX:       # BB#0:
+; AVX-NEXT:    pushl %eax
+; AVX-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm0
+; AVX-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovss %xmm0, (%esp)
+; AVX-NEXT:    flds (%esp)
+; AVX-NEXT:    popl %eax
+; AVX-NEXT:    retl
   %y = fptrunc <1 x double> %x to <1 x float>
   ret <1 x float> %y
 }
 
 define <2 x float> @test2(<2 x double> %x) nounwind {
-; CHECK: test2
-; CHECK: cvtpd2ps
-; CHECK: ret
-; AVX:   test2
-; AVX-NOT:  vcvtpd2psy
-; AVX:   vcvtpd2ps
-; AVX:   ret
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cvtpd2ps %xmm0, %xmm0
+; CHECK-NEXT:    retl
+;
+; AVX-LABEL: test2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vcvtpd2ps %xmm0, %xmm0
+; AVX-NEXT:    retl
   %y = fptrunc <2 x double> %x to <2 x float>
   ret <2 x float> %y
 }
 
 define <4 x float> @test3(<4 x double> %x) nounwind {
-; CHECK: test3
-; CHECK: cvtpd2ps
-; CHECK: cvtpd2ps
-; CHECK: movlhps
-; CHECK: ret
-; AVX:   test3
-; AVX:   vcvtpd2psy
-; AVX:   ret
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cvtpd2ps %xmm1, %xmm1
+; CHECK-NEXT:    cvtpd2ps %xmm0, %xmm0
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retl
+;
+; AVX-LABEL: test3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vcvtpd2psy %ymm0, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retl
   %y = fptrunc <4 x double> %x to <4 x float>
   ret <4 x float> %y
 }
 
 define <8 x float> @test4(<8 x double> %x) nounwind {
-; CHECK: test4
-; CHECK: cvtpd2ps
-; CHECK: cvtpd2ps
-; CHECK: movlhps
-; CHECK: cvtpd2ps
-; CHECK: cvtpd2ps
-; CHECK: movlhps
-; CHECK: ret
-; AVX:   test4
-; AVX:   vcvtpd2psy
-; AVX:   vcvtpd2psy
-; AVX:   vinsertf128
-; AVX:   ret
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cvtpd2ps %xmm1, %xmm1
+; CHECK-NEXT:    cvtpd2ps %xmm0, %xmm0
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    cvtpd2ps %xmm3, %xmm3
+; CHECK-NEXT:    cvtpd2ps %xmm2, %xmm1
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; CHECK-NEXT:    retl
+;
+; AVX-LABEL: test4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vcvtpd2psy %ymm0, %xmm0
+; AVX-NEXT:    vcvtpd2psy %ymm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retl
   %y = fptrunc <8 x double> %x to <8 x float>
   ret <8 x float> %y
 }
diff --git a/test/CodeGen/X86/fpstack-debuginstr-kill.ll b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
new file mode 100644
index 0000000..dfc59a3
--- /dev/null
+++ b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
@@ -0,0 +1,71 @@
+; RUN: llc < %s -mcpu=generic -mtriple=i386-apple-darwin -no-integrated-as
+
+@g1 = global double 0.000000e+00, align 8
+@g2 = global i32 0, align 4
+
+define void @_Z16fpuop_arithmeticjj(i32, i32) {
+entry:
+  switch i32 undef, label %sw.bb.i1921 [
+  ]
+
+sw.bb261:                                         ; preds = %entry, %entry
+  unreachable
+
+sw.bb.i1921:                                      ; preds = %if.end504
+  switch i32 undef, label %if.end511 [
+    i32 1, label %sw.bb27.i
+  ]
+
+sw.bb27.i:                                        ; preds = %sw.bb.i1921
+  %conv.i.i1923 = fpext float undef to x86_fp80
+  br label %if.end511
+
+if.end511:                                        ; preds = %sw.bb27.i, %sw.bb13.i
+  %src.sroa.0.0.src.sroa.0.0.2280 = phi x86_fp80 [ %conv.i.i1923, %sw.bb27.i ], [ undef, %sw.bb.i1921 ]
+  switch i32 undef, label %sw.bb992 [
+    i32 3, label %sw.bb735
+    i32 18, label %if.end41.i2210
+  ]
+
+sw.bb735:                                         ; preds = %if.end511
+  %2 = call x86_fp80 asm sideeffect "frndint", "={st},0,~{dirflag},~{fpsr},~{flags}"(x86_fp80 %src.sroa.0.0.src.sroa.0.0.2280)
+  unreachable
+
+if.end41.i2210:                                   ; preds = %if.end511
+  call void @llvm.dbg.value(metadata !{x86_fp80 %src.sroa.0.0.src.sroa.0.0.2280}, i64 0, metadata !20, metadata !{metadata !"0x102"})
+  unreachable
+
+sw.bb992:                                         ; preds = %if.end511
+  ret void
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!24, !25}
+!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 (http://llvm.org/git/clang 8444ae7cfeaefae031f8fedf0d1435ca3b14d90b) (http://llvm.org/git/llvm 886f0101a7d176543b831f5efb74c03427244a55)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !21, metadata !2} ; [ DW_TAG_compile_unit ] [x87stackifier/fpu_ieee.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"fpu_ieee.cpp", metadata !"x87stackifier"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x2e\00fpuop_arithmetic\00fpuop_arithmetic\00_Z16fpuop_arithmeticjj\0011\000\001\000\006\00256\001\0013", metadata !5, metadata !6, metadata !7, null, void (i32, i32)* @_Z16fpuop_arithmeticjj, null, null, metadata !10} ; [ DW_TAG_subprogram ] [line 11] [def] [scope 13] [fpuop_arithmetic]
+!5 = metadata !{metadata !"f1.cpp", metadata !"x87stackifier"}
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [x87stackifier/f1.cpp]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{null, metadata !9, metadata !9}
+!9 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!10 = metadata !{metadata !11, metadata !12, metadata !13, metadata !18, metadata !20}
+!11 = metadata !{metadata !"0x101\00\0016777227\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 11]
+!12 = metadata !{metadata !"0x101\00\0033554443\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 11]
+!13 = metadata !{metadata !"0x100\00x\0014\000", metadata !4, metadata !6, metadata !14} ; [ DW_TAG_auto_variable ] [x] [line 14]
+!14 = metadata !{metadata !"0x16\00fpu_extended\003\000\000\000\000", metadata !5, null, metadata !15} ; [ DW_TAG_typedef ] [fpu_extended] [line 3, size 0, align 0, offset 0] [from fpu_register]
+!15 = metadata !{metadata !"0x16\00fpu_register\002\000\000\000\000", metadata !5, null, metadata !16} ; [ DW_TAG_typedef ] [fpu_register] [line 2, size 0, align 0, offset 0] [from uae_f64]
+!16 = metadata !{metadata !"0x16\00uae_f64\001\000\000\000\000", metadata !5, null, metadata !17} ; [ DW_TAG_typedef ] [uae_f64] [line 1, size 0, align 0, offset 0] [from double]
+!17 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!18 = metadata !{metadata !"0x100\00a\0015\000", metadata !4, metadata !6, metadata !19} ; [ DW_TAG_auto_variable ] [a] [line 15]
+!19 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!20 = metadata !{metadata !"0x100\00value\0016\000", metadata !4, metadata !6, metadata !14} ; [ DW_TAG_auto_variable ] [value] [line 16]
+!21 = metadata !{metadata !22, metadata !23}
+!22 = metadata !{metadata !"0x34\00g1\00g1\00\005\000\001", null, metadata !6, metadata !14, double* @g1, null} ; [ DW_TAG_variable ] [g1] [line 5] [def]
+!23 = metadata !{metadata !"0x34\00g2\00g2\00\006\000\001", null, metadata !6, metadata !19, i32* @g2, null} ; [ DW_TAG_variable ] [g2] [line 6] [def]
+!24 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!25 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/frameaddr.ll b/test/CodeGen/X86/frameaddr.ll
index 6c1ca25..452c8e5 100644
--- a/test/CodeGen/X86/frameaddr.ll
+++ b/test/CodeGen/X86/frameaddr.ll
@@ -2,6 +2,8 @@
 ; RUN: llc < %s -march=x86    -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-32
 ; RUN: llc < %s -march=x86-64                             | FileCheck %s --check-prefix=CHECK-64
 ; RUN: llc < %s -march=x86-64 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc < %s -mtriple=x86_64-gnux32                    | FileCheck %s --check-prefix=CHECK-X32ABI
+; RUN: llc < %s -mtriple=x86_64-gnux32 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-X32ABI
 
 define i8* @test1() nounwind {
 entry:
@@ -17,6 +19,12 @@ entry:
 ; CHECK-64-NEXT:  movq %rbp, %rax
 ; CHECK-64-NEXT:  pop
 ; CHECK-64-NEXT:  ret
+; CHECK-X32ABI-LABEL: test1
+; CHECK-X32ABI:       pushq %rbp
+; CHECK-X32ABI-NEXT:  movl %esp, %ebp
+; CHECK-X32ABI-NEXT:  movl %ebp, %eax
+; CHECK-X32ABI-NEXT:  popq %rbp
+; CHECK-X32ABI-NEXT:  ret
   %0 = tail call i8* @llvm.frameaddress(i32 0)
   ret i8* %0
 }
@@ -37,6 +45,13 @@ entry:
 ; CHECK-64-NEXT:  movq (%rax), %rax
 ; CHECK-64-NEXT:  pop
 ; CHECK-64-NEXT:  ret
+; CHECK-X32ABI-LABEL: test2
+; CHECK-X32ABI:       pushq %rbp
+; CHECK-X32ABI-NEXT:  movl %esp, %ebp
+; CHECK-X32ABI-NEXT:  movl (%ebp), %eax
+; CHECK-X32ABI-NEXT:  movl (%eax), %eax
+; CHECK-X32ABI-NEXT:  popq %rbp
+; CHECK-X32ABI-NEXT:  ret
   %0 = tail call i8* @llvm.frameaddress(i32 2)
   ret i8* %0
 }
diff --git a/test/CodeGen/X86/gcc_except_table_functions.ll b/test/CodeGen/X86/gcc_except_table_functions.ll
new file mode 100644
index 0000000..4a81680
--- /dev/null
+++ b/test/CodeGen/X86/gcc_except_table_functions.ll
@@ -0,0 +1,53 @@
+; RUN: llc -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s
+
+; This test demonstrates that it is possible to use functions for typeinfo
+; instead of global variables. While __gxx_personality_v0 would never know what
+; to do with them, other EH schemes such as SEH might use them.
+
+declare i32 @__gxx_personality_v0(...)
+declare void @filt0()
+declare void @filt1()
+declare void @_Z1fv()
+declare i32 @llvm.eh.typeid.for(i8*)
+
+define i32 @main() uwtable {
+entry:
+  invoke void @_Z1fv()
+          to label %try.cont unwind label %lpad
+
+try.cont:
+  ret i32 0
+
+lpad:
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* bitcast (void ()* @filt0 to i8*)
+          catch i8* bitcast (void ()* @filt1 to i8*)
+  %sel = extractvalue { i8*, i32 } %0, 1
+  %id0 = call i32 @llvm.eh.typeid.for(i8* bitcast (void ()* @filt0 to i8*))
+  %is_f0 = icmp eq i32 %sel, %id0
+  br i1 %is_f0, label %try.cont, label %check_f1
+
+check_f1:
+  %id1 = call i32 @llvm.eh.typeid.for(i8* bitcast (void ()* @filt1 to i8*))
+  %is_f1 = icmp eq i32 %sel, %id1
+  br i1 %is_f1, label %try.cont, label %eh.resume
+
+eh.resume:
+  resume { i8*, i32 } %0
+}
+
+; CHECK-LABEL: main:
+; CHECK: .cfi_startproc
+; CHECK: .cfi_personality 3, __gxx_personality_v0
+; CHECK: .cfi_lsda 3, .Lexception0
+; CHECK: .cfi_def_cfa_offset 16
+; CHECK: callq _Z1fv
+; CHECK: retq
+; CHECK: cmpl $2, %edx
+; CHECK: je
+; CHECK: cmpl $1, %edx
+; CHECK: je
+; CHECK: callq _Unwind_Resume
+; CHECK: .cfi_endproc
+; CHECK: GCC_except_table0:
+; CHECK: Lexception0:
diff --git a/test/CodeGen/X86/global-sections.ll b/test/CodeGen/X86/global-sections.ll
index c763f39..fa1169d 100644
--- a/test/CodeGen/X86/global-sections.ll
+++ b/test/CodeGen/X86/global-sections.ll
@@ -53,21 +53,20 @@ define void @F1() {
 
 
 ; _Complex long long const G4 = 34;
-@G4 = unnamed_addr constant {i64,i64} { i64 34, i64 0 }
+@G4 = private unnamed_addr constant {i64,i64} { i64 34, i64 0 }
 
 ; DARWIN: .section        __TEXT,__literal16,16byte_literals
-; DARWIN: _G4:
+; DARWIN: L_G4:
 ; DARWIN:     .long 34
 
 ; DARWIN-STATIC: .section        __TEXT,__literal16,16byte_literals
-; DARWIN-STATIC: _G4:
+; DARWIN-STATIC: L_G4:
 ; DARWIN-STATIC:     .long 34
 
 ; DARWIN64: .section        __TEXT,__literal16,16byte_literals
-; DARWIN64: _G4:
+; DARWIN64: L_G4:
 ; DARWIN64:     .quad 34
 
-
 ; int G5 = 47;
 @G5 = global i32 47
 
@@ -194,3 +193,23 @@ define void @F1() {
 ; WIN32-SECTIONS: L_G14:
 ; WIN32-SECTIONS:        .asciz  "foo"
 
+; cannot be merged on MachO, but can on other formats.
+@G15 = unnamed_addr constant i64 0
+
+; LINUX: .section        .rodata.cst8,"aM",@progbits,8
+; LINUX: G15:
+
+; DARWIN: .section      __TEXT,__const
+; DARWIN: _G15:
+
+; DARWIN-STATIC: .section       __TEXT,__const
+; DARWIN-STATIC: _G15:
+
+; DARWIN64: .section       __TEXT,__const
+; DARWIN64: _G15:
+
+; LINUX-SECTIONS: .section      .rodata.G15,"aM",@progbits,8
+; LINUX-SECTIONS: G15:
+
+; WIN32-SECTIONS: .section      .rdata,"rd",one_only,_G15
+; WIN32-SECTIONS: _G15:
diff --git a/test/CodeGen/X86/half.ll b/test/CodeGen/X86/half.ll
new file mode 100644
index 0000000..1dcf939
--- /dev/null
+++ b/test/CodeGen/X86/half.ll
@@ -0,0 +1,69 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-F16C
+
+define void @test_load_store(half* %in, half* %out) {
+; CHECK-LABEL: test_load_store:
+; CHECK: movw (%rdi), [[TMP:%[a-z0-9]+]]
+; CHECK: movw [[TMP]], (%rsi)
+  %val = load half* %in
+  store half %val, half* %out
+  ret void
+}
+
+define i16 @test_bitcast_from_half(half* %addr) {
+; CHECK-LABEL: test_bitcast_from_half:
+; CHECK: movzwl (%rdi), %eax
+  %val = load half* %addr
+  %val_int = bitcast half %val to i16
+  ret i16 %val_int
+}
+
+define void @test_bitcast_to_half(half* %addr, i16 %in) {
+; CHECK-LABEL: test_bitcast_to_half:
+; CHECK: movw %si, (%rdi)
+  %val_fp = bitcast i16 %in to half
+  store half %val_fp, half* %addr
+  ret void
+}
+
+define float @test_extend32(half* %addr) {
+; CHECK-LABEL: test_extend32:
+
+; CHECK-LIBCALL: jmp __gnu_h2f_ieee
+; CHECK-FP16: vcvtph2ps
+  %val16 = load half* %addr
+  %val32 = fpext half %val16 to float
+  ret float %val32
+}
+
+define double @test_extend64(half* %addr) {
+; CHECK-LABEL: test_extend64:
+
+; CHECK-LIBCALL: callq __gnu_h2f_ieee
+; CHECK-LIBCALL: cvtss2sd
+; CHECK-FP16: vcvtph2ps
+; CHECK-FP16: vcvtss2sd
+  %val16 = load half* %addr
+  %val32 = fpext half %val16 to double
+  ret double %val32
+}
+
+define void @test_trunc32(float %in, half* %addr) {
+; CHECK-LABEL: test_trunc32:
+
+; CHECK-LIBCALL: callq __gnu_f2h_ieee
+; CHECK-FP16: vcvtps2ph
+  %val16 = fptrunc float %in to half
+  store half %val16, half* %addr
+  ret void
+}
+
+define void @test_trunc64(double %in, half* %addr) {
+; CHECK-LABEL: test_trunc64:
+
+; CHECK-LIBCALL: callq __truncdfhf2
+; CHECK-FP16: callq __truncdfhf2
+  %val16 = fptrunc double %in to half
+  store half %val16, half* %addr
+  ret void
+}
diff --git a/test/CodeGen/X86/i8-umulo.ll b/test/CodeGen/X86/i8-umulo.ll
deleted file mode 100644
index ba846f3..0000000
--- a/test/CodeGen/X86/i8-umulo.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; RUN: llc -mcpu=generic -march=x86 < %s | FileCheck %s
-; PR19858
-
-declare {i8, i1} @llvm.umul.with.overflow.i8(i8 %a, i8 %b)
-define i8 @testumulo(i32 %argc) {
-; CHECK: imulw
-; CHECK: testb %{{.+}}, %{{.+}}
-; CHECK: je [[NOOVERFLOWLABEL:.+]]
-; CHECK: {{.*}}[[NOOVERFLOWLABEL]]:
-; CHECK-NEXT: movb
-; CHECK-NEXT: retl
-top:
-  %RHS = trunc i32 %argc to i8
-  %umul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 25, i8 %RHS)
-  %ex = extractvalue { i8, i1 } %umul, 1
-  br i1 %ex, label %overflow, label %nooverlow
-
-overflow:
-  ret i8 %RHS
-
-nooverlow:
-  %umul.value = extractvalue { i8, i1 } %umul, 0
-  ret i8 %umul.value
-}
diff --git a/test/CodeGen/X86/inalloca-regparm.ll b/test/CodeGen/X86/inalloca-regparm.ll
new file mode 100644
index 0000000..9dd916b
--- /dev/null
+++ b/test/CodeGen/X86/inalloca-regparm.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=i686-windows-msvc < %s -o /dev/null
+; RUN: not llc -mtriple=x86_64-windows-msvc %s -o /dev/null 2>&1 | FileCheck %s
+
+; This will compile successfully on x86 but not x86_64, because %b will become a
+; register parameter.
+
+declare x86_thiscallcc i32 @f(i32 %a, i32* inalloca %b)
+define void @g() {
+  %b = alloca inalloca i32
+  store i32 2, i32* %b
+  call x86_thiscallcc i32 @f(i32 0, i32* inalloca %b)
+  ret void
+}
+
+; CHECK: cannot use inalloca attribute on a register parameter
diff --git a/test/CodeGen/X86/inline-asm-fpstack.ll b/test/CodeGen/X86/inline-asm-fpstack.ll
index 91c477b..bb3778a 100644
--- a/test/CodeGen/X86/inline-asm-fpstack.ll
+++ b/test/CodeGen/X86/inline-asm-fpstack.ll
@@ -340,3 +340,65 @@ entry:
   %0 = tail call i32 asm "fcomi $2, $1; pushf; pop $0", "=r,{st},{st(1)},~{dirflag},~{fpsr},~{flags}"(double 2.000000e+00, double 2.000000e+00) nounwind
   ret i32 %0
 }
+
+; <rdar://problem/16952634>
+; X87 stackifier asserted when there was an ST register defined by an
+; inline-asm instruction and the ST register was live across another
+; inline-asm instruction.
+;
+; INLINEASM <es:frndint> [sideeffect] [attdialect], $0:[regdef], %ST0<imp-def,tied5>, $1:[reguse tiedto:$0], %ST0<tied3>, $2:[clobber], %EFLAGS<earlyclobber,imp-def,dead>
+; INLINEASM <es:fldcw $0> [sideeffect] [mayload] [attdialect], $0:[mem], %EAX<undef>, 1, %noreg, 0, %noreg, $1:[clobber], %EFLAGS<earlyclobber,imp-def,dead>
+; %FP0<def> = COPY %ST0
+
+; CHECK-LABEL: _test_live_st
+; CHECK: ## InlineAsm Start
+; CHECK: frndint
+; CHECK: ## InlineAsm End
+; CHECK: ## InlineAsm Start
+; CHECK: fldcw
+; CHECK: ## InlineAsm End
+
+%struct.fpu_t = type { [8 x x86_fp80], x86_fp80, %struct.anon1, %struct.anon2, i32, i8, [15 x i8] }
+%struct.anon1 = type { i32, i32, i32 }
+%struct.anon2 = type { i32, i32, i32, i32 }
+
+@fpu = external global %struct.fpu_t, align 16
+
+; Function Attrs: ssp
+define void @test_live_st(i32 %a1) {
+entry:
+  %0 = load x86_fp80* undef, align 16
+  %cond = icmp eq i32 %a1, 1
+  br i1 %cond, label %sw.bb4.i, label %_Z5tointRKe.exit
+
+sw.bb4.i:
+  %1 = call x86_fp80 asm sideeffect "frndint", "={st},0,~{dirflag},~{fpsr},~{flags}"(x86_fp80 %0)
+  call void asm sideeffect "fldcw $0", "*m,~{dirflag},~{fpsr},~{flags}"(i32* undef)
+  br label %_Z5tointRKe.exit
+
+_Z5tointRKe.exit:
+  %result.0.i = phi x86_fp80 [ %1, %sw.bb4.i ], [ %0, %entry ]
+  %conv.i1814 = fptosi x86_fp80 %result.0.i to i32
+  %conv626 = sitofp i32 %conv.i1814 to x86_fp80
+  store x86_fp80 %conv626, x86_fp80* getelementptr inbounds (%struct.fpu_t* @fpu, i32 0, i32 1)
+  br label %return
+
+return:
+  ret void
+}
+
+; Check that x87 stackifier is correctly rewriting FP registers to ST registers.
+;
+; CHECK-LABEL: _test_operand_rewrite
+; CHECK: ## InlineAsm Start
+; CHECK: foo %st(0), %st(1)
+; CHECK: ## InlineAsm End
+
+define double @test_operand_rewrite() {
+entry:
+  %0 = tail call { double, double } asm sideeffect "foo $0, $1", "={st},={st(1)},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { double, double } %0, 0
+  %asmresult1 = extractvalue { double, double } %0, 1
+  %sub = fsub double %asmresult, %asmresult1
+  ret double %sub
+}
diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll
index d417453..dfa8aed 100644
--- a/test/CodeGen/X86/jump_sign.ll
+++ b/test/CodeGen/X86/jump_sign.ll
@@ -284,7 +284,7 @@ entry:
 define i32 @func_test1(i32 %p1) nounwind uwtable {
 entry:
 ; CHECK-LABEL: func_test1:
-; CHECK: testb
+; CHECK: andb
 ; CHECK: j
 ; CHECK: ret
   %0 = load i32* @b, align 4
diff --git a/test/CodeGen/X86/jump_table_alias.ll b/test/CodeGen/X86/jump_table_alias.ll
index f3691fd..2062200 100644
--- a/test/CodeGen/X86/jump_table_alias.ll
+++ b/test/CodeGen/X86/jump_table_alias.ll
@@ -5,7 +5,7 @@ entry:
   ret i32 0
 }
 
-@i = alias internal i32 ()* @f
+@i = internal alias i32 ()* @f
 @j = alias i32 ()* @f
 
 define i32 @main(i32 %argc, i8** %argv) {
@@ -25,7 +25,6 @@ define i32 @main(i32 %argc, i8** %argv) {
 ; There should only be one table, even though there are two GlobalAliases,
 ; because they both alias the same value.
 
-; CHECK:         .globl  __llvm_jump_instr_table_0_1
 ; CHECK:         .align  8, 0x90
 ; CHECK:         .type   __llvm_jump_instr_table_0_1,@function
 ; CHECK: __llvm_jump_instr_table_0_1:
diff --git a/test/CodeGen/X86/jump_table_align.ll b/test/CodeGen/X86/jump_table_align.ll
new file mode 100644
index 0000000..6ad48d1
--- /dev/null
+++ b/test/CodeGen/X86/jump_table_align.ll
@@ -0,0 +1,29 @@
+; RUN: llc -filetype=obj <%s -jump-table-type=single -o %t1
+; RUN: llvm-objdump -triple=x86_64-unknown-linux-gnu -d %t1 | FileCheck %s
+target triple = "x86_64-unknown-linux-gnu"
+define i32 @f() unnamed_addr jumptable {
+  ret i32 0
+}
+
+define i32 @g(i8* %a) unnamed_addr jumptable {
+  ret i32 0
+}
+
+define void @h(void ()* %func) unnamed_addr jumptable {
+  ret void
+}
+
+define i32 @main() {
+  %g = alloca i32 (...)*, align 8
+  store i32 (...)* bitcast (i32 ()* @f to i32 (...)*), i32 (...)** %g, align 8
+  %1 = load i32 (...)** %g, align 8
+  %call = call i32 (...)* %1()
+  call void (void ()*)* @h(void ()* bitcast (void (void ()*)* @h to void ()*))
+  %a = call i32 (i32*)* bitcast (i32 (i8*)* @g to i32(i32*)*)(i32* null)
+  ret i32 %a
+}
+
+; Make sure that the padding from getJumpInstrTableEntryBound is right.
+; CHECK: __llvm_jump_instr_table_0_1:
+; CHECK-NEXT: e9 00 00 00 00                                  jmp     0
+; CHECK-NEXT: 0f 1f 00                                        nopl    (%rax)
diff --git a/test/CodeGen/X86/jump_table_bitcast.ll b/test/CodeGen/X86/jump_table_bitcast.ll
index 33a798f..749b77a 100644
--- a/test/CodeGen/X86/jump_table_bitcast.ll
+++ b/test/CodeGen/X86/jump_table_bitcast.ll
@@ -15,12 +15,12 @@ define void @h(void ()* %func) unnamed_addr jumptable {
 define i32 @main() {
   %g = alloca i32 (...)*, align 8
   store i32 (...)* bitcast (i32 ()* @f to i32 (...)*), i32 (...)** %g, align 8
-; CHECK: movq    $__llvm_jump_instr_table_0_[[ENTRY:1|2|3]], (%rsp)
-; CHECK: movl    $__llvm_jump_instr_table_0_[[ENTRY]], %ecx
+; CHECK: movq    $__llvm_jump_instr_table_0_[[ENTRY:1|2|3]],
+; CHECK: movl    $__llvm_jump_instr_table_0_[[ENTRY]],
   %1 = load i32 (...)** %g, align 8
   %call = call i32 (...)* %1()
   call void (void ()*)* @h(void ()* bitcast (void (void ()*)* @h to void ()*))
-; CHECK: movl    $__llvm_jump_instr_table_0_{{1|2|3}}, %edi
+; CHECK: movl    $__llvm_jump_instr_table_0_{{1|2|3}},
 ; CHECK: callq   h
 
   %a = call i32 (i32*)* bitcast (i32 (i8*)* @g to i32(i32*)*)(i32* null)
@@ -28,17 +28,14 @@ define i32 @main() {
   ret i32 %a
 }
 
-; CHECK:         .globl  __llvm_jump_instr_table_0_1
 ; CHECK:         .align  8, 0x90
 ; CHECK:         .type   __llvm_jump_instr_table_0_1,@function
 ; CHECK: __llvm_jump_instr_table_0_1:
 ; CHECK:         jmp     {{f|g|h}}@PLT
-; CHECK:         .globl  __llvm_jump_instr_table_0_2
 ; CHECK:         .align  8, 0x90
 ; CHECK:         .type   __llvm_jump_instr_table_0_2,@function
 ; CHECK: __llvm_jump_instr_table_0_2:
 ; CHECK:         jmp     {{f|g|h}}@PLT
-; CHECK:         .globl  __llvm_jump_instr_table_0_3
 ; CHECK:         .align  8, 0x90
 ; CHECK:         .type   __llvm_jump_instr_table_0_3,@function
 ; CHECK: __llvm_jump_instr_table_0_3:
diff --git a/test/CodeGen/X86/jump_tables.ll b/test/CodeGen/X86/jump_tables.ll
index 5a0aed0..485154e 100644
--- a/test/CodeGen/X86/jump_tables.ll
+++ b/test/CodeGen/X86/jump_tables.ll
@@ -7,6 +7,20 @@ target triple = "x86_64-unknown-linux-gnu"
 
 %struct.fun_struct = type { i32 (...)* }
 
+@a = global [12 x i32 () *] [ i32 ()* bitcast (void ()* @indirect_fun to i32 ()*),
+			      i32 ()* bitcast (void ()* @indirect_fun_match to i32 ()*),
+			      i32 ()* bitcast (i32 ()* @indirect_fun_i32 to i32 ()*),
+			      i32 ()* bitcast (i32 (i32)* @indirect_fun_i32_1 to i32 ()*),
+			      i32 ()* bitcast (i32 (i32, i32)* @indirect_fun_i32_2 to i32 ()*),
+			      i32 ()* bitcast (i32* (i32*, i32)* @indirect_fun_i32S_2 to i32 ()*),
+			      i32 ()* bitcast (void (%struct.fun_struct)* @indirect_fun_struct to i32 ()*),
+			      i32 ()* bitcast (void (i32 (...)*, i32)* @indirect_fun_fun to i32 ()*),
+			      i32 ()* bitcast (i32 (i32 (...)*, i32)* @indirect_fun_fun_ret to i32 ()*),
+			      i32 ()* bitcast (void ([19 x i8])* @indirect_fun_array to i32 ()*),
+			      i32 ()* bitcast (void (<3 x i32>)* @indirect_fun_vec to i32 ()*),
+			      i32 ()* bitcast (void (<4 x float>)* @indirect_fun_vec_2 to i32 ()*)
+			    ]
+
 define void @indirect_fun() unnamed_addr jumptable {
   ret void
 }
@@ -74,62 +88,50 @@ define i32 @main(i32 %argc, i8** %argv) {
   ret i32 %a
 }
 
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_1
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_1,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_1:
 ; SINGLE-DAG:         jmp     indirect_fun_array@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_2
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_2,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_2:
 ; SINGLE-DAG:         jmp     indirect_fun_i32_2@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_3
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_3,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_3:
 ; SINGLE-DAG:         jmp     indirect_fun_vec_2@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_4
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_4,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_4:
 ; SINGLE-DAG:         jmp     indirect_fun_i32S_2@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_5
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_5,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_5:
 ; SINGLE-DAG:         jmp     indirect_fun_struct@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_6
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_6,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_6:
 ; SINGLE-DAG:         jmp     indirect_fun_i32_1@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_7
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_7,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_7:
 ; SINGLE-DAG:         jmp     indirect_fun_i32@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_8
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_8,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_8:
 ; SINGLE-DAG:         jmp     indirect_fun_fun@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_9
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_9,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_9:
 ; SINGLE-DAG:         jmp     indirect_fun_fun_ret@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_10
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_10,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_10:
 ; SINGLE-DAG:         jmp     indirect_fun@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_11
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_11,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_11:
 ; SINGLE-DAG:         jmp     indirect_fun_match@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_12
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_12,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_12:
@@ -144,82 +146,69 @@ define i32 @main(i32 %argc, i8** %argv) {
 ; SINGLE-DAG:         ud2
 
 
-; ARITY-DAG:         .globl  __llvm_jump_instr_table_2_1
 ; ARITY-DAG:         .align  8, 0x90
 ; ARITY-DAG:         .type   __llvm_jump_instr_table_2_1,@function
 ; ARITY-DAG: __llvm_jump_instr_table_2_1:
 ; ARITY-DAG:         jmp     indirect_fun{{.*}}@PLT
 ; ARITY-DAG:         .align  8, 0x90
 ; ARITY-DAG:         ud2
-; ARITY-DAG:         .globl  __llvm_jump_instr_table_0_1
 ; ARITY-DAG:         .align  8, 0x90
 ; ARITY-DAG:         .type   __llvm_jump_instr_table_0_1,@function
 ; ARITY-DAG: __llvm_jump_instr_table_0_1:
 ; ARITY-DAG:         jmp     indirect_fun{{.*}}@PLT
-; ARITY-DAG:         .globl  __llvm_jump_instr_table_1_1
 ; ARITY-DAG:         .align  8, 0x90
 ; ARITY-DAG:         .type   __llvm_jump_instr_table_1_1,@function
 ; ARITY-DAG: __llvm_jump_instr_table_1_1:
 ; ARITY-DAG:         jmp     indirect_fun{{.*}}@PLT
 
-; SIMPL-DAG:         .globl  __llvm_jump_instr_table_2_1
 ; SIMPL-DAG:         .align  8, 0x90
 ; SIMPL-DAG:         .type   __llvm_jump_instr_table_2_1,@function
 ; SIMPL-DAG: __llvm_jump_instr_table_2_1:
 ; SIMPL-DAG:         jmp     indirect_fun{{.*}}@PLT
 ; SIMPL-DAG:         .align  8, 0x90
 ; SIMPL-DAG:         ud2
-; SIMPL-DAG:         .globl  __llvm_jump_instr_table_0_1
 ; SIMPL-DAG:         .align  8, 0x90
 ; SIMPL-DAG:         .type   __llvm_jump_instr_table_0_1,@function
 ; SIMPL-DAG: __llvm_jump_instr_table_0_1:
 ; SIMPL-DAG:         jmp     indirect_fun{{.*}}@PLT
-; SIMPL-DAG:         .globl  __llvm_jump_instr_table_1_1
 ; SIMPL-DAG:         .align  8, 0x90
 ; SIMPL-DAG:         .type   __llvm_jump_instr_table_1_1,@function
 ; SIMPL-DAG: __llvm_jump_instr_table_1_1:
 ; SIMPL-DAG:         jmp     indirect_fun{{.*}}@PLT
-; SIMPL-DAG:         .globl  __llvm_jump_instr_table_3_1
 ; SIMPL-DAG:         .align  8, 0x90
 ; SIMPL-DAG:         .type   __llvm_jump_instr_table_3_1,@function
 ; SIMPL-DAG: __llvm_jump_instr_table_3_1:
 ; SIMPL-DAG:         jmp     indirect_fun{{.*}}@PLT
-; SIMPL-DAG:         .globl  __llvm_jump_instr_table_4_1
 ; SIMPL-DAG:         .align  8, 0x90
 ; SIMPL-DAG:         .type   __llvm_jump_instr_table_4_1,@function
 ; SIMPL-DAG: __llvm_jump_instr_table_4_1:
 ; SIMPL-DAG:         jmp     indirect_fun{{.*}}@PLT
 
 
-; FULL-DAG:        .globl  __llvm_jump_instr_table_10_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_10_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_10_1:
 ; FULL-DAG:        jmp     indirect_fun_i32_1@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_9_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_9_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_9_1:
 ; FULL-DAG:        jmp     indirect_fun_i32_2@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_7_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_7_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_7_1:
 ; FULL-DAG:        jmp     indirect_fun_i32S_2@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_3_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_3_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_3_1:
 ; FULL-DAG:        jmp     indirect_fun_vec_2@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_2_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_2_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_2_1:
@@ -228,42 +217,36 @@ define i32 @main(i32 %argc, i8** %argv) {
 ; FULL-DAG:        ud2
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_8_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_8_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_8_1:
 ; FULL-DAG:        jmp     indirect_fun_i32@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_1_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_1_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_1_1:
 ; FULL-DAG:        jmp     indirect_fun_array@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_0_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_0_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_0_1:
 ; FULL-DAG:        jmp     indirect_fun_vec@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_6_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_6_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_6_1:
 ; FULL-DAG:        jmp     indirect_fun_struct@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_5_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_5_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_5_1:
 ; FULL-DAG:        jmp     indirect_fun_fun@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_4_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_4_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_4_1:
diff --git a/test/CodeGen/X86/lea-2.ll b/test/CodeGen/X86/lea-2.ll
index 82cefb7..6fb3879 100644
--- a/test/CodeGen/X86/lea-2.ll
+++ b/test/CodeGen/X86/lea-2.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | FileCheck %s
+; RUN: llc < %s -mtriple=i686-linux -x86-asm-syntax=intel | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -x86-asm-syntax=intel | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -x86-asm-syntax=intel | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-nacl -x86-asm-syntax=intel | FileCheck %s
 
 define i32 @test1(i32 %A, i32 %B) {
   %tmp1 = shl i32 %A, 2
diff --git a/test/CodeGen/X86/lea-3.ll b/test/CodeGen/X86/lea-3.ll
index c439ee1..a56403a 100644
--- a/test/CodeGen/X86/lea-3.ll
+++ b/test/CodeGen/X86/lea-3.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
 
 ; CHECK: leaq (,[[A0:%rdi|%rcx]],4), %rax
diff --git a/test/CodeGen/X86/lea-4.ll b/test/CodeGen/X86/lea-4.ll
index cef4726..00c2278 100644
--- a/test/CodeGen/X86/lea-4.ll
+++ b/test/CodeGen/X86/lea-4.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s
+
 
 define zeroext i16 @t1(i32 %on_off) nounwind {
 entry:
diff --git a/test/CodeGen/X86/lea-5.ll b/test/CodeGen/X86/lea-5.ll
new file mode 100644
index 0000000..50d3aaf
--- /dev/null
+++ b/test/CodeGen/X86/lea-5.ll
@@ -0,0 +1,59 @@
+; test for more complicated forms of lea operands which can be generated
+; in loop optimized cases.
+; See also http://llvm.org/bugs/show_bug.cgi?id=20016
+
+; RUN: llc < %s -mtriple=x86_64-linux -O2        | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -O2 | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-nacl -O2 | FileCheck %s -check-prefix=X32
+
+; Function Attrs: nounwind readnone uwtable
+define void @foo(i32 %x, i32 %d) #0 {
+entry:
+  %a = alloca [8 x i32], align 16
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %d.addr.0 = phi i32 [ %d, %entry ], [ %inc, %while.cond ]
+  %arrayidx = getelementptr inbounds [8 x i32]* %a, i32 0, i32 %d.addr.0
+
+; CHECK: leaq	-40(%rsp,%r{{[^,]*}},4), %rax
+; X32:   leal	-40(%rsp,%r{{[^,]*}},4), %eax
+  %0 = load i32* %arrayidx, align 4
+  %cmp1 = icmp eq i32 %0, 0
+  %inc = add nsw i32 %d.addr.0, 1
+
+; CHECK: leaq	4(%r{{[^,]*}}), %r{{[^,]*}}
+; X32:   leal	4(%r{{[^,]*}}), %e{{[^,]*}}
+  br i1 %cmp1, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret void
+}
+
+; The same test as above but with enforsed stack realignment (%a aligned by 64)
+; to check one more case of correct lea generation.
+
+; Function Attrs: nounwind readnone uwtable
+define void @bar(i32 %x, i32 %d) #0 {
+entry:
+  %a = alloca [8 x i32], align 64
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %d.addr.0 = phi i32 [ %d, %entry ], [ %inc, %while.cond ]
+  %arrayidx = getelementptr inbounds [8 x i32]* %a, i32 0, i32 %d.addr.0
+
+; CHECK: leaq	(%rsp,%r{{[^,]*}},4), %rax
+; X32:   leal	(%rsp,%r{{[^,]*}},4), %eax
+  %0 = load i32* %arrayidx, align 4
+  %cmp1 = icmp eq i32 %0, 0
+  %inc = add nsw i32 %d.addr.0, 1
+
+; CHECK: leaq	4(%r{{[^,]*}}), %r{{[^,]*}}
+; X32:   leal	4(%r{{[^,]*}}), %e{{[^,]*}}
+  br i1 %cmp1, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret void
+}
+
diff --git a/test/CodeGen/X86/lea.ll b/test/CodeGen/X86/lea.ll
index 93cfe46..9b6632c 100644
--- a/test/CodeGen/X86/lea.ll
+++ b/test/CodeGen/X86/lea.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s
 
 define i32 @test1(i32 %x) nounwind {
         %tmp1 = shl i32 %x, 3
diff --git a/test/CodeGen/X86/long-extend.ll b/test/CodeGen/X86/long-extend.ll
deleted file mode 100644
index 5bbd41d..0000000
--- a/test/CodeGen/X86/long-extend.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc < %s -mcpu=core-avx-i -mtriple=x86_64-linux -asm-verbose=0| FileCheck %s
-define void @test_long_extend(<16 x i8> %a, <16 x i32>* %p) nounwind {
-; CHECK-LABEL: test_long_extend
-; CHECK: vpunpcklbw	%xmm1, %xmm0, [[REG1:%xmm[0-9]+]]
-; CHECK: vpunpckhwd	%xmm1, [[REG1]], [[REG2:%xmm[0-9]+]]
-; CHECK: vpunpcklwd	%xmm1, [[REG1]], %x[[REG3:mm[0-9]+]]
-; CHECK: vinsertf128	$1, [[REG2]], %y[[REG3]], [[REG_result0:%ymm[0-9]+]]
-; CHECK: vpunpckhbw	%xmm1, %xmm0, [[REG4:%xmm[0-9]+]]
-; CHECK: vpunpckhwd	%xmm1, [[REG4]], [[REG5:%xmm[0-9]+]]
-; CHECK: vpunpcklwd	%xmm1, [[REG4]], %x[[REG6:mm[0-9]+]]
-; CHECK: vinsertf128	$1, [[REG5]], %y[[REG6]], [[REG_result1:%ymm[0-9]+]]
-; CHECK: vmovaps	[[REG_result1]], 32(%rdi)
-; CHECK: vmovaps	[[REG_result0]], (%rdi)
-
-  %tmp = zext <16 x i8> %a to <16 x i32>
-  store <16 x i32> %tmp, <16 x i32>*%p
-  ret void
-}
diff --git a/test/CodeGen/X86/loop-strength-reduce8.ll b/test/CodeGen/X86/loop-strength-reduce8.ll
index 1d04276..c36047c 100644
--- a/test/CodeGen/X86/loop-strength-reduce8.ll
+++ b/test/CodeGen/X86/loop-strength-reduce8.ll
@@ -1,6 +1,9 @@
 ; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
 
-; CHECK: leal 16(%eax), %edx
+; FIXME: The first two instructions, movl and addl, should have been combined to
+; "leal 16(%eax), %edx" by the backend (PR20776).
+; CHECK: movl    %eax, %edx
+; CHECK: addl    $16, %edx
 ; CHECK: align
 ; CHECK: addl    $4, %edx
 ; CHECK: decl    %ecx
diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll
index f47161e..edb8433e 100644
--- a/test/CodeGen/X86/lower-bitcast.ll
+++ b/test/CodeGen/X86/lower-bitcast.ll
@@ -68,13 +68,13 @@ define i64 @test4(i64 %A) {
   %2 = bitcast <2 x i32> %add to i64
   ret i64 %2
 }
-; FIXME: At the moment we still produce the sequence pshufd+paddq+pshufd.
+; FIXME: At the moment we still produce the sequence pshufd+paddd+pshufd.
 ; Ideally, we should fold that sequence into a single paddd. This is fixed with
 ; the widening legalization.
 ;
 ; CHECK-LABEL: test4
 ; CHECK: pshufd
-; CHECK-NEXT: paddq
+; CHECK-NEXT: paddd
 ; CHECK-NEXT: pshufd
 ; CHECK: ret
 ;
diff --git a/test/CodeGen/X86/mem-intrin-base-reg.ll b/test/CodeGen/X86/mem-intrin-base-reg.ll
new file mode 100644
index 0000000..dd7f396
--- /dev/null
+++ b/test/CodeGen/X86/mem-intrin-base-reg.ll
@@ -0,0 +1,100 @@
+; RUN: llc -mtriple=i686-windows -mattr=+sse2 < %s | FileCheck %s
+
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+; There is a conflict between lowering the X86 memory intrinsics and the "base"
+; register used to address stack locals.  See X86RegisterInfo::hasBaseRegister
+; for when this is necessary. Typically, we chose ESI for the base register,
+; which all of the X86 string instructions use.
+
+; The pattern of vector icmp and extractelement is used in these tests because
+; it forces creation of an aligned stack temporary. Perhaps such temporaries
+; shouldn't be aligned.
+
+declare void @escape_vla_and_icmp(i8*, i1 zeroext)
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1)
+
+define i32 @memcpy_novla_vector(<4 x i32>* %vp0, i8* %a, i8* %b, i32 %n, i1 zeroext %cond) {
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %b, i32 128, i32 4, i1 false)
+  br i1 %cond, label %spill_vectors, label %no_vectors
+
+no_vectors:
+  ret i32 0
+
+spill_vectors:
+  %vp1 = getelementptr <4 x i32>* %vp0, i32 1
+  %v0 = load <4 x i32>* %vp0
+  %v1 = load <4 x i32>* %vp1
+  %vicmp = icmp slt <4 x i32> %v0, %v1
+  %icmp = extractelement <4 x i1> %vicmp, i32 0
+  call void @escape_vla_and_icmp(i8* null, i1 zeroext %icmp)
+  %r = extractelement <4 x i32> %v0, i32 0
+  ret i32 %r
+}
+
+; CHECK-LABEL: _memcpy_novla_vector:
+; CHECK: andl $-16, %esp
+; CHECK-DAG: movl $32, %ecx
+; CHECK-DAG: movl {{.*}}, %esi
+; CHECK-DAG: movl {{.*}}, %edi
+; CHECK: rep;movsl
+
+define i32 @memcpy_vla_vector(<4 x i32>* %vp0, i8* %a, i8* %b, i32 %n, i1 zeroext %cond) {
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %b, i32 128, i32 4, i1 false)
+  br i1 %cond, label %spill_vectors, label %no_vectors
+
+no_vectors:
+  ret i32 0
+
+spill_vectors:
+  %vp1 = getelementptr <4 x i32>* %vp0, i32 1
+  %v0 = load <4 x i32>* %vp0
+  %v1 = load <4 x i32>* %vp1
+  %vicmp = icmp slt <4 x i32> %v0, %v1
+  %icmp = extractelement <4 x i1> %vicmp, i32 0
+  %vla = alloca i8, i32 %n
+  call void @escape_vla_and_icmp(i8* %vla, i1 zeroext %icmp)
+  %r = extractelement <4 x i32> %v0, i32 0
+  ret i32 %r
+}
+
+; CHECK-LABEL: _memcpy_vla_vector:
+; CHECK: andl $-16, %esp
+; CHECK: movl %esp, %esi
+; CHECK: movl $128, {{.*}}(%esp)
+; CHECK: calll _memcpy
+; CHECK: calll __chkstk
+
+; stosd doesn't clobber esi, so we can use it.
+
+define i32 @memset_vla_vector(<4 x i32>* %vp0, i8* %a, i32 %n, i1 zeroext %cond) {
+  call void @llvm.memset.p0i8.i32(i8* %a, i8 42, i32 128, i32 4, i1 false)
+  br i1 %cond, label %spill_vectors, label %no_vectors
+
+no_vectors:
+  ret i32 0
+
+spill_vectors:
+  %vp1 = getelementptr <4 x i32>* %vp0, i32 1
+  %v0 = load <4 x i32>* %vp0
+  %v1 = load <4 x i32>* %vp1
+  %vicmp = icmp slt <4 x i32> %v0, %v1
+  %icmp = extractelement <4 x i1> %vicmp, i32 0
+  %vla = alloca i8, i32 %n
+  call void @escape_vla_and_icmp(i8* %vla, i1 zeroext %icmp)
+  %r = extractelement <4 x i32> %v0, i32 0
+  ret i32 %r
+}
+
+; CHECK-LABEL: _memset_vla_vector:
+; CHECK: andl $-16, %esp
+; CHECK: movl %esp, %esi
+; CHECK-DAG: movl $707406378, %eax        # imm = 0x2A2A2A2A
+; CHECK-DAG: movl $32, %ecx
+; CHECK-DAG: movl {{.*}}, %edi
+; CHECK-NOT: movl {{.*}}, %esi
+; CHECK: rep;stosl
+
+; Add a test for memcmp if we ever add a special lowering for it.
diff --git a/test/CodeGen/X86/mem-promote-integers.ll b/test/CodeGen/X86/mem-promote-integers.ll
index 0015df0..ea38b95 100644
--- a/test/CodeGen/X86/mem-promote-integers.ll
+++ b/test/CodeGen/X86/mem-promote-integers.ll
@@ -1,8 +1,8 @@
 ; Test the basic functionality of integer element promotions of different types.
 ; This tests checks passing of arguments, loading and storing to memory and
 ; basic arithmetic.
-; RUN: llc -march=x86 < %s
-; RUN: llc -march=x86-64 < %s
+; RUN: llc -march=x86 < %s > /dev/null
+; RUN: llc -march=x86-64 < %s > /dev/null
 
 define <1 x i8> @test_1xi8(<1 x i8> %x, <1 x i8>* %b) {
   %bb = load <1 x i8>* %b
diff --git a/test/CodeGen/X86/misched-matmul.ll b/test/CodeGen/X86/misched-matmul.ll
index 3ea6512..5454b7c 100644
--- a/test/CodeGen/X86/misched-matmul.ll
+++ b/test/CodeGen/X86/misched-matmul.ll
@@ -10,7 +10,7 @@
 ; more complex cases.
 ;
 ; CHECK: @wrap_mul4
-; CHECK: 22 regalloc - Number of spills inserted
+; CHECK: 23 regalloc - Number of spills inserted
 
 define void @wrap_mul4(double* nocapture %Out, [4 x double]* nocapture %A, [4 x double]* nocapture %B) #0 {
 entry:
diff --git a/test/CodeGen/X86/movgs.ll b/test/CodeGen/X86/movgs.ll
index 71b0723..96c5dbb 100644
--- a/test/CodeGen/X86/movgs.ll
+++ b/test/CodeGen/X86/movgs.ll
@@ -3,40 +3,58 @@
 ; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=penryn -mattr=sse4.1 | FileCheck %s --check-prefix=X64
 
 define i32 @test1() nounwind readonly {
+; X32-LABEL: test1:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movl %gs:196, %eax
+; X32-NEXT:    movl (%eax), %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test1:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movq %gs:320, %rax
+; X64-NEXT:    movl (%rax), %eax
+; X64-NEXT:    retq
 entry:
 	%tmp = load i32* addrspace(256)* getelementptr (i32* addrspace(256)* inttoptr (i32 72 to i32* addrspace(256)*), i32 31)		; <i32*> [#uses=1]
 	%tmp1 = load i32* %tmp		; <i32> [#uses=1]
 	ret i32 %tmp1
 }
-; X32-LABEL: test1:
-; X32: 	movl	%gs:196, %eax
-; X32: 	movl	(%eax), %eax
-; X32: 	ret
-
-; X64-LABEL: test1:
-; X64: 	movq	%gs:320, %rax
-; X64: 	movl	(%rax), %eax
-; X64: 	ret
 
 define i64 @test2(void (i8*)* addrspace(256)* %tmp8) nounwind {
+; X32-LABEL: test2:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    subl $12, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    calll *%gs:(%eax)
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    addl $12, %esp
+; X32-NEXT:    retl
+;
+; X64-LABEL: test2:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    {{(subq.*%rsp|pushq)}}
+; X64-NEXT:    callq *%gs:(%{{(rcx|rdi)}})
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    {{(addq.*%rsp|popq)}}
+; X64-NEXT:    retq
 entry:
   %tmp9 = load void (i8*)* addrspace(256)* %tmp8, align 8
   tail call void %tmp9(i8* undef) nounwind optsize
   ret i64 0
 }
 
-; rdar://8453210
-; X32-LABEL: test2:
-; X32: movl	{{.*}}(%esp), %eax
-; X32: calll	*%gs:(%eax)
-
-; X64-LABEL: test2:
-; X64: callq	*%gs:([[A0:%rdi|%rcx]])
-
-
-
-
 define <2 x i64> @pmovsxwd_1(i64 addrspace(256)* %p) nounwind readonly {
+; X32-LABEL: pmovsxwd_1:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pmovsxwd %gs:(%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: pmovsxwd_1:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    pmovsxwd %gs:(%{{(rcx|rdi)}}), %xmm0
+; X64-NEXT:    retq
 entry:
   %0 = load i64 addrspace(256)* %p
   %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0
@@ -44,20 +62,26 @@ entry:
   %2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone
   %3 = bitcast <4 x i32> %2 to <2 x i64>
   ret <2 x i64> %3
-  
-; X32-LABEL: pmovsxwd_1:
-; X32: 	movl	4(%esp), %eax
-; X32: 	pmovsxwd	%gs:(%eax), %xmm0
-; X32: 	ret
-
-; X64-LABEL: pmovsxwd_1:
-; X64:	pmovsxwd	%gs:([[A0]]), %xmm0
-; X64:	ret
 }
 
 ; The two loads here both look identical to selection DAG, except for their
 ; address spaces.  Make sure they aren't CSE'd.
 define i32 @test_no_cse() nounwind readonly {
+; X32-LABEL: test_no_cse:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movl %gs:196, %eax
+; X32-NEXT:    movl (%eax), %eax
+; X32-NEXT:    movl %fs:196, %ecx
+; X32-NEXT:    addl (%ecx), %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_no_cse:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movq %gs:320, %rax
+; X64-NEXT:    movl (%rax), %eax
+; X64-NEXT:    movq %fs:320, %rcx
+; X64-NEXT:    addl (%rcx), %eax
+; X64-NEXT:    retq
 entry:
 	%tmp = load i32* addrspace(256)* getelementptr (i32* addrspace(256)* inttoptr (i32 72 to i32* addrspace(256)*), i32 31)		; <i32*> [#uses=1]
 	%tmp1 = load i32* %tmp		; <i32> [#uses=1]
@@ -66,9 +90,5 @@ entry:
 	%tmp4 = add i32 %tmp1, %tmp3
 	ret i32 %tmp4
 }
-; X32-LABEL: test_no_cse:
-; X32: 	movl	%gs:196
-; X32: 	movl	%fs:196
-; X32: 	ret
 
 declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll
index 6910515..f0bdbba 100644
--- a/test/CodeGen/X86/ms-inline-asm.ll
+++ b/test/CodeGen/X86/ms-inline-asm.ll
@@ -110,7 +110,7 @@ define i32 @t31() {
 entry:
   %val = alloca i32, align 64
   store i32 -1, i32* %val, align 64
-  call void asm sideeffect inteldialect "mov dword ptr $0, esp", "=*m,~{dirflag},~{fpsr},~{flags}"(i32* %val) #1
+  call void asm sideeffect inteldialect "mov dword ptr $0, esp", "=*m,~{dirflag},~{fpsr},~{flags}"(i32* %val)
   %sp = load i32* %val, align 64
   ret i32 %sp
 ; CHECK-LABEL: t31:
@@ -125,3 +125,12 @@ entry:
 ; CHECK: movl (%esp), %eax
 ; CHECK: ret
 }
+
+declare hidden void @other_func()
+
+define void @naked() #0 {
+  call void asm sideeffect inteldialect "call dword ptr $0", "*m,~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{esp},~{ebp},~{dirflag},~{fpsr},~{flags}"(void()* @other_func)
+  unreachable
+}
+
+attributes #0 = { naked }
diff --git a/test/CodeGen/X86/musttail-varargs.ll b/test/CodeGen/X86/musttail-varargs.ll
new file mode 100644
index 0000000..1e99c14
--- /dev/null
+++ b/test/CodeGen/X86/musttail-varargs.ll
@@ -0,0 +1,119 @@
+; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX
+; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS
+
+; Test that we actually spill and reload all arguments in the variadic argument
+; pack. Doing a normal call will clobber all argument registers, and we will
+; spill around it. A simple adjustment should not require any XMM spills.
+
+declare void(i8*, ...)* @get_f(i8* %this)
+
+define void @f_thunk(i8* %this, ...) {
+  %fptr = call void(i8*, ...)*(i8*)* @get_f(i8* %this)
+  musttail call void (i8*, ...)* %fptr(i8* %this, ...)
+  ret void
+}
+
+; Save and restore 6 GPRs, 8 XMMs, and AL around the call.
+
+; LINUX-LABEL: f_thunk:
+; LINUX-DAG: movq %rdi, {{.*}}
+; LINUX-DAG: movq %rsi, {{.*}}
+; LINUX-DAG: movq %rdx, {{.*}}
+; LINUX-DAG: movq %rcx, {{.*}}
+; LINUX-DAG: movq %r8, {{.*}}
+; LINUX-DAG: movq %r9, {{.*}}
+; LINUX-DAG: movb %al, {{.*}}
+; LINUX-DAG: movaps %xmm0, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm1, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm2, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm3, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm4, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm5, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm6, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm7, {{[0-9]*}}(%rsp)
+; LINUX: callq get_f
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm0
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm1
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm2
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm3
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm4
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm5
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm6
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm7
+; LINUX-DAG: movq {{.*}}, %rdi
+; LINUX-DAG: movq {{.*}}, %rsi
+; LINUX-DAG: movq {{.*}}, %rdx
+; LINUX-DAG: movq {{.*}}, %rcx
+; LINUX-DAG: movq {{.*}}, %r8
+; LINUX-DAG: movq {{.*}}, %r9
+; LINUX-DAG: movb {{.*}}, %al
+; LINUX: jmpq *{{.*}}  # TAILCALL
+
+; WINDOWS-LABEL: f_thunk:
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS-DAG: movq %rdx, {{.*}}
+; WINDOWS-DAG: movq %rcx, {{.*}}
+; WINDOWS-DAG: movq %r8, {{.*}}
+; WINDOWS-DAG: movq %r9, {{.*}}
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS: callq get_f
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS-DAG: movq {{.*}}, %rdx
+; WINDOWS-DAG: movq {{.*}}, %rcx
+; WINDOWS-DAG: movq {{.*}}, %r8
+; WINDOWS-DAG: movq {{.*}}, %r9
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS: jmpq *{{.*}} # TAILCALL
+
+; This thunk shouldn't require any spills and reloads, assuming the register
+; allocator knows what it's doing.
+
+define void @g_thunk(i8* %fptr_i8, ...) {
+  %fptr = bitcast i8* %fptr_i8 to void (i8*, ...)*
+  musttail call void (i8*, ...)* %fptr(i8* %fptr_i8, ...)
+  ret void
+}
+
+; LINUX-LABEL: g_thunk:
+; LINUX-NOT: movq
+; LINUX: jmpq *%rdi  # TAILCALL
+
+; WINDOWS-LABEL: g_thunk:
+; WINDOWS-NOT: movq
+; WINDOWS: jmpq *%rcx # TAILCALL
+
+; Do a simple multi-exit multi-bb test.
+
+%struct.Foo = type { i1, i8*, i8* }
+
+@g = external global i32
+
+define void @h_thunk(%struct.Foo* %this, ...) {
+  %cond_p = getelementptr %struct.Foo* %this, i32 0, i32 0
+  %cond = load i1* %cond_p
+  br i1 %cond, label %then, label %else
+
+then:
+  %a_p = getelementptr %struct.Foo* %this, i32 0, i32 1
+  %a_i8 = load i8** %a_p
+  %a = bitcast i8* %a_i8 to void (%struct.Foo*, ...)*
+  musttail call void (%struct.Foo*, ...)* %a(%struct.Foo* %this, ...)
+  ret void
+
+else:
+  %b_p = getelementptr %struct.Foo* %this, i32 0, i32 2
+  %b_i8 = load i8** %b_p
+  %b = bitcast i8* %b_i8 to void (%struct.Foo*, ...)*
+  store i32 42, i32* @g
+  musttail call void (%struct.Foo*, ...)* %b(%struct.Foo* %this, ...)
+  ret void
+}
+
+; LINUX-LABEL: h_thunk:
+; LINUX: jne
+; LINUX: jmpq *{{.*}} # TAILCALL
+; LINUX: jmpq *{{.*}} # TAILCALL
+; WINDOWS-LABEL: h_thunk:
+; WINDOWS: jne
+; WINDOWS: jmpq *{{.*}} # TAILCALL
+; WINDOWS: jmpq *{{.*}} # TAILCALL
diff --git a/test/CodeGen/X86/nancvt.ll b/test/CodeGen/X86/nancvt.ll
index 8036710..8a665fa 100644
--- a/test/CodeGen/X86/nancvt.ll
+++ b/test/CodeGen/X86/nancvt.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -std-compile-opts | llc > %t
+; RUN: opt < %s -O3 | llc > %t
 ; RUN: grep 2147027116 %t | count 3
 ; RUN: grep 2147228864 %t | count 3
 ; RUN: grep 2146502828 %t | count 3
diff --git a/test/CodeGen/X86/narrow-shl-load.ll b/test/CodeGen/X86/narrow-shl-load.ll
index 30387925..5175bfc 100644
--- a/test/CodeGen/X86/narrow-shl-load.ll
+++ b/test/CodeGen/X86/narrow-shl-load.ll
@@ -30,40 +30,6 @@ while.end:                                        ; preds = %while.cond
   ret void
 }
 
-
-; DAGCombiner shouldn't fold the sdiv (ashr) away.
-; rdar://8636812
-; CHECK-LABEL: test2:
-; CHECK:   sarl
-
-define i32 @test2() nounwind {
-entry:
-  %i = alloca i32, align 4
-  %j = alloca i8, align 1
-  store i32 127, i32* %i, align 4
-  store i8 0, i8* %j, align 1
-  %tmp3 = load i32* %i, align 4
-  %mul = mul nsw i32 %tmp3, 2
-  %conv4 = trunc i32 %mul to i8
-  %conv5 = sext i8 %conv4 to i32
-  %div6 = sdiv i32 %conv5, 2
-  %conv7 = trunc i32 %div6 to i8
-  %conv9 = sext i8 %conv7 to i32
-  %cmp = icmp eq i32 %conv9, -1
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
-  ret i32 0
-
-if.end:                                           ; preds = %entry
-  call void @abort() noreturn
-  unreachable
-}
-
-declare void @abort() noreturn
-
-declare void @exit(i32) noreturn
-
 ; DAG Combiner can't fold this into a load of the 1'th byte.
 ; PR8757
 define i32 @test3(i32 *%P) nounwind ssp {
diff --git a/test/CodeGen/X86/nonconst-static-ev.ll b/test/CodeGen/X86/nonconst-static-ev.ll
index f852cae..5449791 100644
--- a/test/CodeGen/X86/nonconst-static-ev.ll
+++ b/test/CodeGen/X86/nonconst-static-ev.ll
@@ -1,6 +1,5 @@
 ; RUN: not llc -march=x86 -mtriple=x86_64-linux-gnu < %s 2> %t
 ; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
-; REQUIRES: shell
 
 @0 = global i8 extractvalue ([1 x i8] select (i1 ptrtoint (i32* @1 to i1), [1 x i8] [ i8 1 ], [1 x i8] [ i8 2 ]), 0)
 @1 = external global i32
diff --git a/test/CodeGen/X86/nonconst-static-iv.ll b/test/CodeGen/X86/nonconst-static-iv.ll
index 8fad39b..30613ef 100644
--- a/test/CodeGen/X86/nonconst-static-iv.ll
+++ b/test/CodeGen/X86/nonconst-static-iv.ll
@@ -1,6 +1,5 @@
 ; RUN: not llc -march=x86 -mtriple=x86_64-linux-gnu < %s 2> %t
 ; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
-; REQUIRES: shell
 
 @0 = global i8 insertvalue( { i8 } select (i1 ptrtoint (i32* @1 to i1), { i8 } { i8 1 }, { i8 } { i8 2 }), i8 0, 0)
 @1 = external global i32
diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll
new file mode 100644
index 0000000..9d0cb9a
--- /dev/null
+++ b/test/CodeGen/X86/nontemporal-2.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+
+
+; Make sure that we generate non-temporal stores for the test cases below.
+
+define void @test1(<4 x float>* %dst) {
+; CHECK-LABEL: test1:
+; SSE: movntps
+; AVX: vmovntps
+  store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test2(<4 x i32>* %dst) {
+; CHECK-LABEL: test2:
+; SSE: movntps
+; AVX: vmovntps
+  store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test3(<2 x double>* %dst) {
+; CHECK-LABEL: test3:
+; SSE: movntps
+; AVX: vmovntps
+  store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+!1 = metadata !{i32 1}
diff --git a/test/CodeGen/X86/null-streamer.ll b/test/CodeGen/X86/null-streamer.ll
index fa77fcb..b559729 100644
--- a/test/CodeGen/X86/null-streamer.ll
+++ b/test/CodeGen/X86/null-streamer.ll
@@ -14,16 +14,16 @@ define void @f1() {
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!11, !13}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !" ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2, metadata !""}
+!0 = metadata !{metadata !"0x11\004\00 \001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !"", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"", metadata !"", metadata !"", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* null, null, null, metadata !2, i32 2}
-!5 = metadata !{i32 786473, metadata !1}
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null}
+!4 = metadata !{metadata !"0x2e\00\00\00\002\000\001\000\006\00256\001\002", metadata !1, metadata !5, metadata !6, null, i32 ()* null, null, null, metadata !2} ; [ DW_TAG_subprogram ]
+!5 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
+!8 = metadata !{metadata !"0x24\00\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786484, i32 0, null, metadata !"i", metadata !"i", metadata !"_ZL1i", metadata !5, i32 1, metadata !8, i32 1, i32 1, null, null}
+!10 = metadata !{metadata !"0x34\00i\00i\00_ZL1i\001\001\001", null, metadata !5, metadata !8, null, null} ; [ DW_TAG_variable ]
 !11 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/object-size.ll b/test/CodeGen/X86/object-size.ll
index ec35d29..0610f0b 100644
--- a/test/CodeGen/X86/object-size.ll
+++ b/test/CodeGen/X86/object-size.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 < %s -march=x86-64 | FileCheck %s -check-prefix=X64
+; RUN: llc -O0 < %s -march=x86-64 | FileCheck %s
 
 ; ModuleID = 'ts.c'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@@ -12,8 +12,8 @@ entry:
   %tmp = load i8** @p                             ; <i8*> [#uses=1]
   %0 = call i64 @llvm.objectsize.i64.p0i8(i8* %tmp, i1 0) ; <i64> [#uses=1]
   %cmp = icmp ne i64 %0, -1                       ; <i1> [#uses=1]
-; X64: movabsq $-1, [[RAX:%r..]]
-; X64: cmpq    $-1, [[RAX]]
+; CHECK: movq $-1, [[RAX:%r..]]
+; CHECK: cmpq $-1, [[RAX]]
   br i1 %cmp, label %cond.true, label %cond.false
 
 cond.true:                                        ; preds = %entry
diff --git a/test/CodeGen/X86/osx-private-labels.ll b/test/CodeGen/X86/osx-private-labels.ll
index 349ce7d..e30cb48 100644
--- a/test/CodeGen/X86/osx-private-labels.ll
+++ b/test/CodeGen/X86/osx-private-labels.ll
@@ -69,3 +69,20 @@
 ; CHECK: .section	__DATA,__foobar,interposing
 ; CHECK-NEXT: .align	3
 ; CHECK-NEXT: L_private12:
+
+@private13 = private global i32 42, section "__DATA, __objc_classlist, regular, no_dead_strip"
+; CHECK: .section	__DATA,__objc_classlist,regular,no_dead_strip
+; CHECK-NEXT: .align	2
+; CHECK-NEXT: L_private13:
+
+@private14 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_classname,cstring_literals"
+; CHECK: .section	__TEXT,__objc_classname,cstring_literals
+; CHECK-NEXT: L_private14:
+
+@private15 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_methname,cstring_literals"
+; CHECK: .section	__TEXT,__objc_methname,cstring_literals
+; CHECK-NEXT: L_private15:
+
+@private16 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_methtype,cstring_literals"
+; CHECK: .section	__TEXT,__objc_methtype,cstring_literals
+; CHECK-NEXT: L_private16:
diff --git a/test/CodeGen/X86/palignr.ll b/test/CodeGen/X86/palignr.ll
index ec6564d..3efcc2e 100644
--- a/test/CodeGen/X86/palignr.ll
+++ b/test/CodeGen/X86/palignr.ll
@@ -3,58 +3,127 @@
 
 define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
 ; CHECK-LABEL: test1:
-; CHECK: pshufd
-; CHECK-YONAH: pshufd
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,2,3,0]
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test1:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,2,3,0]
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <4 x i32> %A, <4 x i32> undef, <4 x i32> < i32 1, i32 2, i32 3, i32 0 >
 	ret <4 x i32> %C
 }
 
 define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) nounwind {
 ; CHECK-LABEL: test2:
-; CHECK: palignr
-; CHECK-YONAH: shufps
+; CHECK:       # BB#0:
+; CHECK-NEXT:    palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test2:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; CHECK-YONAH-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 1, i32 2, i32 3, i32 4 >
 	ret <4 x i32> %C
 }
 
 define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) nounwind {
 ; CHECK-LABEL: test3:
-; CHECK: palignr
+; CHECK:       # BB#0:
+; CHECK-NEXT:    palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test3:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 1, i32 2, i32 undef, i32 4 >
 	ret <4 x i32> %C
 }
 
 define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) nounwind {
 ; CHECK-LABEL: test4:
-; CHECK: palignr
+; CHECK:       # BB#0:
+; CHECK-NEXT:    palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test4:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; CHECK-YONAH-NEXT:    movapd %xmm1, %xmm0
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 6, i32 7, i32 undef, i32 1 >
 	ret <4 x i32> %C
 }
 
 define <4 x float> @test5(<4 x float> %A, <4 x float> %B) nounwind {
 ; CHECK-LABEL: test5:
-; CHECK: palignr
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; CHECK-NEXT:    movapd %xmm1, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test5:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; CHECK-YONAH-NEXT:    movapd %xmm1, %xmm0
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 6, i32 7, i32 undef, i32 1 >
 	ret <4 x float> %C
 }
 
 define <8 x i16> @test6(<8 x i16> %A, <8 x i16> %B) nounwind {
 ; CHECK-LABEL: test6:
-; CHECK: palignr
+; CHECK:       # BB#0:
+; CHECK-NEXT:    palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test6:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; CHECK-YONAH-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
+; CHECK-YONAH-NEXT:    por %xmm1, %xmm0
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 3, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10 >
 	ret <8 x i16> %C
 }
 
 define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) nounwind {
 ; CHECK-LABEL: test7:
-; CHECK: palignr
+; CHECK:       # BB#0:
+; CHECK-NEXT:    palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test7:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-YONAH-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
+; CHECK-YONAH-NEXT:    por %xmm1, %xmm0
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 undef, i32 6, i32 undef, i32 8, i32 9, i32 10, i32 11, i32 12 >
 	ret <8 x i16> %C
 }
 
 define <16 x i8> @test8(<16 x i8> %A, <16 x i8> %B) nounwind {
 ; CHECK-LABEL: test8:
-; CHECK: palignr
+; CHECK:       # BB#0:
+; CHECK-NEXT:    palignr {{.*#+}} xmm1 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test8:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
+; CHECK-YONAH-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
+; CHECK-YONAH-NEXT:    por %xmm1, %xmm0
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <16 x i8> %A, <16 x i8> %B, <16 x i32> < i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20 >
 	ret <16 x i8> %C
 }
@@ -65,8 +134,19 @@ define <16 x i8> @test8(<16 x i8> %A, <16 x i8> %B) nounwind {
 ; was an UNDEF.)
 define <8 x i16> @test9(<8 x i16> %A, <8 x i16> %B) nounwind {
 ; CHECK-LABEL: test9:
-; CHECK-NOT: palignr
-; CHECK: pshufb
+; CHECK:       # BB#0:
+; CHECK-NEXT:    palignr {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test9:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-YONAH-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; CHECK-YONAH-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; CHECK-YONAH-NEXT:    por %xmm0, %xmm1
+; CHECK-YONAH-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <8 x i16> %B, <8 x i16> %A, <8 x i32> < i32 undef, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0 >
 	ret <8 x i16> %C
 }
diff --git a/test/CodeGen/X86/patchpoint-invoke.ll b/test/CodeGen/X86/patchpoint-invoke.ll
new file mode 100644
index 0000000..192cacc
--- /dev/null
+++ b/test/CodeGen/X86/patchpoint-invoke.ll
@@ -0,0 +1,63 @@
+; RUN: llc -mtriple=x86_64-unknown-linux -mcpu=corei7                             < %s | FileCheck %s
+
+; Test invoking of patchpoints
+;
+define i64 @patchpoint_invoke(i64 %p1, i64 %p2) {
+entry:
+; CHECK-LABEL: patchpoint_invoke:
+; CHECK-NEXT: .cfi_startproc
+; CHECK:      [[FUNC_BEGIN:.L.*]]:
+; CHECK:      .cfi_lsda 3, [[EXCEPTION_LABEL:.L[^ ]*]]
+; CHECK:      pushq %rbp
+
+; Unfortunately, hardcode the name of the label that begins the patchpoint:
+; CHECK:      .Ltmp0:
+; CHECK:      movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+; CHECK-NEXT: xchgw %ax, %ax
+; CHECK-NEXT: [[PP_END:.L.*]]:
+; CHECK:      ret
+  %resolveCall = inttoptr i64 -559038736 to i8*
+  %result = invoke i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall, i32 1, i64 %p1, i64 %p2)
+            to label %success unwind label %threw
+
+success:
+  ret i64 %result
+
+threw:
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* null
+  ret i64 0
+}
+
+; Verify that the exception table was emitted:
+; CHECK:      [[EXCEPTION_LABEL]]:
+; CHECK-NEXT: .byte 255
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 21
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 13
+; Verify that the unwind data covers the entire patchpoint region:
+; CHECK-NEXT: .long .Ltmp0-[[FUNC_BEGIN]]
+; CHECK-NEXT: .long [[PP_END]]-.Ltmp0
+
+
+; Verify that the stackmap section got emitted:
+; CHECK-LABEL: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .short 0
+; Num Functions
+; CHECK-NEXT:   .long 1
+; Num LargeConstants
+; CHECK-NEXT:   .long 0
+; Num Callsites
+; CHECK-NEXT:   .long 1
+; CHECK-NEXT:   .quad patchpoint_invoke
+
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
+declare i32 @__gxx_personality_v0(...)
diff --git a/test/CodeGen/X86/patchpoint-webkit_jscc.ll b/test/CodeGen/X86/patchpoint-webkit_jscc.ll
new file mode 100644
index 0000000..5e76bf8
--- /dev/null
+++ b/test/CodeGen/X86/patchpoint-webkit_jscc.ll
@@ -0,0 +1,88 @@
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7                             < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=FAST
+
+; Test the webkit_jscc calling convention.
+; One argument will be passed in register, the other will be pushed on the stack.
+; Return value in $rax.
+define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen:
+; CHECK:      Ltmp
+; CHECK:      movq %r{{.+}}, (%rsp)
+; CHECK:      movq %r{{.+}}, %rax
+; CHECK:      Ltmp
+; CHECK-NEXT: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+; CHECK:      movq %rax, (%rsp)
+; CHECK:      callq
+; FAST-LABEL: jscall_patchpoint_codegen:
+; FAST:       Ltmp
+; FAST:       movq %r{{.+}}, (%rsp)
+; FAST:       movq %r{{.+}}, %rax
+; FAST:       Ltmp
+; FAST-NEXT:  movabsq $-559038736, %r11
+; FAST-NEXT:  callq *%r11
+; FAST:       movq %rax, (%rsp)
+; FAST:       callq
+  %resolveCall2 = inttoptr i64 -559038736 to i8*
+  %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
+  %resolveCall3 = inttoptr i64 -559038737 to i8*
+  tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
+  ret void
+}
+
+; Test if the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen2(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen2:
+; CHECK:      Ltmp
+; CHECK:      movq $6, 24(%rsp)
+; CHECK-NEXT: movl $4, 16(%rsp)
+; CHECK-NEXT: movq $2, (%rsp)
+; CHECK:      Ltmp
+; CHECK-NEXT: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+; FAST-LABEL: jscall_patchpoint_codegen2:
+; FAST:       Ltmp
+; FAST:       movq $2, (%rsp)
+; FAST-NEXT:  movl $4, 16(%rsp)
+; FAST-NEXT:  movq $6, 24(%rsp)
+; FAST:       Ltmp
+; FAST-NEXT:  movabsq $-559038736, %r11
+; FAST-NEXT:  callq *%r11
+  %call = inttoptr i64 -559038736 to i8*
+  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
+  ret i64 %result
+}
+
+; Test if the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen3(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen3:
+; CHECK:      Ltmp
+; CHECK:      movq $10, 48(%rsp)
+; CHECK-NEXT: movl  $8, 36(%rsp)
+; CHECK-NEXT: movq  $6, 24(%rsp)
+; CHECK-NEXT: movl  $4, 16(%rsp)
+; CHECK-NEXT: movq  $2, (%rsp)
+; CHECK:      Ltmp
+; CHECK-NEXT: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+; FAST-LABEL: jscall_patchpoint_codegen3:
+; FAST:       Ltmp
+; FAST:       movq  $2, (%rsp)
+; FAST-NEXT:  movl  $4, 16(%rsp)
+; FAST-NEXT:  movq  $6, 24(%rsp)
+; FAST-NEXT:  movl  $8, 36(%rsp)
+; FAST-NEXT:  movq $10, 48(%rsp)
+; FAST:       Ltmp
+; FAST-NEXT:  movabsq $-559038736, %r11
+; FAST-NEXT:  callq *%r11
+  %call = inttoptr i64 -559038736 to i8*
+  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
+  ret i64 %result
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
+
diff --git a/test/CodeGen/X86/patchpoint.ll b/test/CodeGen/X86/patchpoint.ll
index 62b1273..07148f0 100644
--- a/test/CodeGen/X86/patchpoint.ll
+++ b/test/CodeGen/X86/patchpoint.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7                             < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort < %s | FileCheck %s
 
 ; Trivial patchpoint codegen
 ;
@@ -38,61 +39,6 @@ entry:
   ret void
 }
 
-; Test the webkit_jscc calling convention.
-; One argument will be passed in register, the other will be pushed on the stack.
-; Return value in $rax.
-define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen:
-; CHECK:      Ltmp
-; CHECK:      movq %r{{.+}}, (%rsp)
-; CHECK:      movq %r{{.+}}, %rax
-; CHECK:      Ltmp
-; CHECK-NEXT: movabsq $-559038736, %r11
-; CHECK-NEXT: callq *%r11
-; CHECK:      movq %rax, (%rsp)
-; CHECK:      callq
-  %resolveCall2 = inttoptr i64 -559038736 to i8*
-  %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
-  %resolveCall3 = inttoptr i64 -559038737 to i8*
-  tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
-  ret void
-}
-
-; Test if the arguments are properly aligned and that we don't store undef arguments.
-define i64 @jscall_patchpoint_codegen2(i64 %callee) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen2:
-; CHECK:      Ltmp
-; CHECK:      movq $6, 24(%rsp)
-; CHECK-NEXT: movl $4, 16(%rsp)
-; CHECK-NEXT: movq $2, (%rsp)
-; CHECK:      Ltmp
-; CHECK-NEXT: movabsq $-559038736, %r11
-; CHECK-NEXT: callq *%r11
-  %call = inttoptr i64 -559038736 to i8*
-  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
-  ret i64 %result
-}
-
-; Test if the arguments are properly aligned and that we don't store undef arguments.
-define i64 @jscall_patchpoint_codegen3(i64 %callee) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen3:
-; CHECK:      Ltmp
-; CHECK:      movq $10, 48(%rsp)
-; CHECK-NEXT: movl  $8, 36(%rsp)
-; CHECK-NEXT: movq  $6, 24(%rsp)
-; CHECK-NEXT: movl  $4, 16(%rsp)
-; CHECK-NEXT: movq  $2, (%rsp)
-; CHECK:      Ltmp
-; CHECK-NEXT: movabsq $-559038736, %r11
-; CHECK-NEXT: callq *%r11
-  %call = inttoptr i64 -559038736 to i8*
-  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
-  ret i64 %result
-}
-
 ; Test patchpoints reusing the same TargetConstant.
 ; <rdar:15390785> Assertion failed: (CI.getNumArgOperands() >= NumArgs + 4)
 ; There is no way to verify this, since it depends on memory allocation.
@@ -125,6 +71,17 @@ entry:
   ret void
 }
 
+; Test large target address.
+define i64 @large_target_address_patchpoint_codegen() {
+entry:
+; CHECK-LABEL: large_target_address_patchpoint_codegen:
+; CHECK:      movabsq $6153737369414576827, %r11
+; CHECK-NEXT: callq *%r11
+  %resolveCall2 = inttoptr i64 6153737369414576827 to i8*
+  %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall2, i32 0)
+  ret i64 %result
+}
+
 declare void @llvm.experimental.stackmap(i64, i32, ...)
 declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
 declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/peep-vector-extract-concat.ll b/test/CodeGen/X86/peep-vector-extract-concat.ll
deleted file mode 100644
index f73ebb9..0000000
--- a/test/CodeGen/X86/peep-vector-extract-concat.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2,-sse4.1 | FileCheck %s
-; CHECK: pshufd $3, %xmm0, %xmm0
-
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2,-sse4.1 | FileCheck %s -check-prefix=WIN64
-; %a is passed indirectly on Win64.
-; WIN64: movss   12(%rcx), %xmm0
-
-define float @foo(<8 x float> %a) nounwind {
-  %c = extractelement <8 x float> %a, i32 3
-  ret float %c
-}
diff --git a/test/CodeGen/X86/peep-vector-extract-insert.ll b/test/CodeGen/X86/peep-vector-extract-insert.ll
deleted file mode 100644
index f958b6b..0000000
--- a/test/CodeGen/X86/peep-vector-extract-insert.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: llc < %s -march=x86-64 | grep "xorps	%xmm0, %xmm0" | count 2
-
-define float @foo(<4 x float> %a) {
-  %b = insertelement <4 x float> %a, float 0.0, i32 3
-  %c = extractelement <4 x float> %b, i32 3
-  ret float %c
-}
-define float @bar(float %a) {
-  %b = insertelement <4 x float> <float 0x400B333340000000, float 4.5, float 0.0, float 0x4022666660000000>, float %a, i32 3
-  %c = extractelement <4 x float> %b, i32 2
-  ret float %c
-}
diff --git a/test/CodeGen/X86/peephole-fold-movsd.ll b/test/CodeGen/X86/peephole-fold-movsd.ll
new file mode 100644
index 0000000..09d9328
--- /dev/null
+++ b/test/CodeGen/X86/peephole-fold-movsd.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s
+;
+; Check that x86's peephole optimization doesn't fold a 64-bit load (movsd) into
+; addpd.
+; rdar://problem/18236850
+
+%struct.S1 = type { double, double }
+
+@g = common global %struct.S1 zeroinitializer, align 8
+
+declare void @foo3(%struct.S1*)
+
+; CHECK: movsd {{[0-9]*}}(%rsp), [[R0:%xmm[0-9]+]]
+; CHECK: addpd [[R0]], %xmm{{[0-9]+}}
+
+define void @foo1(double %a.coerce0, double %a.coerce1, double %b.coerce0, double %b.coerce1) {
+  %1 = alloca <2 x double>, align 16
+  %tmpcast = bitcast <2 x double>* %1 to %struct.S1*
+  call void @foo3(%struct.S1* %tmpcast) #2
+  %p2 = getelementptr inbounds %struct.S1* %tmpcast, i64 0, i32 0
+  %2 = load double* %p2, align 16
+  %p3 = getelementptr inbounds %struct.S1* %tmpcast, i64 0, i32 1
+  %3 = load double* %p3, align 8
+  %4 = insertelement <2 x double> undef, double %2, i32 0
+  %5 = insertelement <2 x double> %4, double 0.000000e+00, i32 1
+  %6 = insertelement <2 x double> undef, double %3, i32 1
+  %7 = insertelement <2 x double> %6, double 1.000000e+00, i32 0
+  %8 = fadd <2 x double> %5, %7
+  store <2 x double> %8, <2 x double>* bitcast (%struct.S1* @g to <2 x double>*), align 16
+  ret void
+}
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 7bf8a61..8937d6a 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1,32 +1,96 @@
-; RUN: llc < %s -march=x86 -mattr=sse4.1 -mcpu=nehalem -stack-alignment=16 > %t
-; RUN: grep pmul %t | count 12
-; RUN: grep mov %t | count 14
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41
 
 define <4 x i32> @a(<4 x i32> %i) nounwind  {
-        %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
-        ret <4 x i32> %A
+; SSE2-LABEL: a:
+; SSE2:         movdqa {{.*}}, %[[X1:xmm[0-9]+]]
+; SSE2-NEXT:    pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %[[X1]], %xmm0
+; SSE2-NEXT:    pmuludq %[[X1]], %[[X2]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2],[[X2]][0,2]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: a:
+; SSE41:         pmulld
+; SSE41-NEXT:    retq
+entry:
+  %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
+  ret <4 x i32> %A
 }
+
 define <2 x i64> @b(<2 x i64> %i) nounwind  {
-        %A = mul <2 x i64> %i, < i64 117, i64 117 >
-        ret <2 x i64> %A
+; ALL-LABEL: b:
+; ALL:         pmuludq
+; ALL:         pmuludq
+; ALL:         pmuludq
+entry:
+  %A = mul <2 x i64> %i, < i64 117, i64 117 >
+  ret <2 x i64> %A
 }
+
 define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind  {
-        %A = mul <4 x i32> %i, %j
-        ret <4 x i32> %A
+; SSE2-LABEL: c:
+; SSE2:         pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %[[X2]], %xmm1
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: c:
+; SSE41:         pmulld
+; SSE41-NEXT:    retq
+entry:
+  %A = mul <4 x i32> %i, %j
+  ret <4 x i32> %A
 }
+
 define <2 x i64> @d(<2 x i64> %i, <2 x i64> %j) nounwind  {
-        %A = mul <2 x i64> %i, %j
-        ret <2 x i64> %A
+; ALL-LABEL: d:
+; ALL:         pmuludq
+; ALL:         pmuludq
+; ALL:         pmuludq
+entry:
+  %A = mul <2 x i64> %i, %j
+  ret <2 x i64> %A
 }
-; Use a call to force spills.
+
 declare void @foo()
+
 define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind  {
-        call void @foo()
-        %A = mul <4 x i32> %i, %j
-        ret <4 x i32> %A
+; SSE2-LABEL: e:
+; SSE2:         movdqa {{[0-9]*}}(%rsp), %xmm0
+; SSE2-NEXT:    pshufd {{.*}} # [[X1:xmm[0-9]+]] = xmm0[1,1,3,3]
+; SSE2-NEXT:    movdqa {{[0-9]*}}(%rsp), %[[X2:xmm[0-9]+]]
+; SSE2-NEXT:    pmuludq %[[X2]], %xmm0
+; SSE2-NEXT:    pshufd {{.*}} # [[X2]] = [[X2]][1,1,3,3]
+; SSE2-NEXT:    pmuludq %[[X1]], %[[X2]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2],[[X2]][0,2]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    addq ${{[0-9]+}}, %rsp
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: e:
+; SSE41:         pmulld {{[0-9]+}}(%rsp), %xmm
+; SSE41-NEXT:    addq ${{[0-9]+}}, %rsp
+; SSE41-NEXT:    retq
+entry:
+  ; Use a call to force spills.
+  call void @foo()
+  %A = mul <4 x i32> %i, %j
+  ret <4 x i32> %A
 }
+
 define <2 x i64> @f(<2 x i64> %i, <2 x i64> %j) nounwind  {
-        call void @foo()
-        %A = mul <2 x i64> %i, %j
-        ret <2 x i64> %A
+; ALL-LABEL: f:
+; ALL:         pmuludq
+; ALL:         pmuludq
+; ALL:         pmuludq
+entry:
+  ; Use a call to force spills.
+  call void @foo()
+  %A = mul <2 x i64> %i, %j
+  ret <2 x i64> %A
 }
diff --git a/test/CodeGen/X86/pr11334.ll b/test/CodeGen/X86/pr11334.ll
index e7e29e0..0bdb0ec 100644
--- a/test/CodeGen/X86/pr11334.ll
+++ b/test/CodeGen/X86/pr11334.ll
@@ -15,7 +15,7 @@ define <3 x double> @v3f2d_ext_vec(<3 x float> %v1) nounwind {
 entry:
 ; CHECK: v3f2d_ext_vec
 ; CHECK: cvtps2pd
-; CHECK: movhlps
+; CHECK: shufpd
 ; CHECK: cvtps2pd
 ; AVX:   v3f2d_ext_vec
 ; AVX:   vcvtps2pd
@@ -28,7 +28,7 @@ define <4 x double> @v4f2d_ext_vec(<4 x float> %v1) nounwind {
 entry:
 ; CHECK: v4f2d_ext_vec
 ; CHECK: cvtps2pd
-; CHECK: movhlps
+; CHECK: shufpd
 ; CHECK: cvtps2pd
 ; AVX:   v4f2d_ext_vec
 ; AVX:   vcvtps2pd
@@ -42,9 +42,9 @@ entry:
 ; CHECK: v8f2d_ext_vec
 ; CHECK: cvtps2pd
 ; CHECK: cvtps2pd
-; CHECK: movhlps
+; CHECK: shufpd
 ; CHECK: cvtps2pd
-; CHECK: movhlps
+; CHECK: shufpd
 ; CHECK: cvtps2pd
 ; AVX:   v8f2d_ext_vec
 ; AVX:   vcvtps2pd
diff --git a/test/CodeGen/X86/pr12359.ll b/test/CodeGen/X86/pr12359.ll
deleted file mode 100644
index 024b163..0000000
--- a/test/CodeGen/X86/pr12359.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: llc -asm-verbose -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s | FileCheck %s
-define <16 x i8> @shuf(<16 x i8> %inval1) {
-entry:
-  %0 = shufflevector <16 x i8> %inval1, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4, i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4>
-  ret <16 x i8> %0
-; CHECK: shuf
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: pshufb
-; CHECK-NEXT: ret
-}
diff --git a/test/CodeGen/X86/pr14161.ll b/test/CodeGen/X86/pr14161.ll
index ff4532e..c2bb8d3 100644
--- a/test/CodeGen/X86/pr14161.ll
+++ b/test/CodeGen/X86/pr14161.ll
@@ -3,6 +3,12 @@
 declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>)
 
 define <2 x i16> @good(<4 x i32>*, <4 x i8>*) {
+; CHECK-LABEL: good:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movdqa (%rdi), %xmm0
+; CHECK-NEXT:    pminud {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    pmovzxwq %xmm0, %xmm0
+; CHECK-NEXT:    retq
 entry:
   %2 = load <4 x i32>* %0, align 16
   %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
@@ -13,13 +19,17 @@ entry:
   %8 = bitcast i32 %4 to <2 x i16>
   %9 = bitcast i32 %5 to <2 x i16>
   ret <2 x i16> %8
-; CHECK: good
-; CHECK: pminud
-; CHECK-NEXT: pmovzxwq
-; CHECK: ret
 }
 
 define <2 x i16> @bad(<4 x i32>*, <4 x i8>*) {
+; CHECK-LABEL: bad:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movdqa (%rdi), %xmm0
+; CHECK-NEXT:    pminud {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    pextrd $1, %xmm0, %eax
+; CHECK-NEXT:    movd %eax, %xmm0
+; CHECK-NEXT:    pmovzxwq %xmm0, %xmm0
+; CHECK-NEXT:    retq
 entry:
   %2 = load <4 x i32>* %0, align 16
   %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
@@ -30,9 +40,4 @@ entry:
   %8 = bitcast i32 %4 to <2 x i16>
   %9 = bitcast i32 %5 to <2 x i16>
   ret <2 x i16> %9
-; CHECK: bad
-; CHECK: pminud
-; CHECK: pextrd
-; CHECK: pmovzxwq
-; CHECK: ret
 }
diff --git a/test/CodeGen/X86/pr15267.ll b/test/CodeGen/X86/pr15267.ll
index c8aaf32..b4dc5fd 100644
--- a/test/CodeGen/X86/pr15267.ll
+++ b/test/CodeGen/X86/pr15267.ll
@@ -48,19 +48,22 @@ define <4 x i64> @test3(<4 x i1>* %in) nounwind {
 
 ; CHECK: test3
 ; CHECK: movzbl
-; CHECK: shrl
-; CHECK: andl $1
-; CHECK: andl $1
-; CHECK: vmovd
-; CHECK: pinsrd $1
-; CHECK: shrl $2
-; CHECK: andl $1
-; CHECK: pinsrd $2
-; CHECK: shrl $3
-; CHECK: andl $1
-; CHECK: pinsrd $3
-; CHECK: pslld
-; CHECK: psrad
-; CHECK: pmovsxdq
-; CHECK: pmovsxdq
+; CHECK: movq
+; CHECK: shlq
+; CHECK: sarq
+; CHECK: vmovq
+; CHECK: movq
+; CHECK: shlq
+; CHECK: sarq
+; CHECK: vmovq
+; CHECK: vpunpcklqdq
+; CHECK: movq
+; CHECK: shlq
+; CHECK: sarq
+; CHECK: vmovq
+; CHECK: shlq
+; CHECK: sarq
+; CHECK: vmovq
+; CHECK: vpunpcklqdq
+; CHECK: vinsertf128
 ; CHECK: ret
diff --git a/test/CodeGen/X86/pr18846.ll b/test/CodeGen/X86/pr18846.ll
new file mode 100644
index 0000000..27801be
--- /dev/null
+++ b/test/CodeGen/X86/pr18846.ll
@@ -0,0 +1,139 @@
+; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; pr18846 - needless avx spill/reload
+; Test for unnecessary repeated spills due to eliminateRedundantSpills failing
+; to recognise unaligned ymm load/stores to the stack.
+; Bugpoint reduced testcase.
+
+;CHECK-LABEL: _Z16opt_kernel_cachePfS_S_
+;CHECK-NOT:   vmovups {{.*#+}} 32-byte Folded Spill
+;CHECK-NOT:   vmovups {{.*#+}} 32-byte Folded Reload
+
+; Function Attrs: uwtable
+define void @_Z16opt_kernel_cachePfS_S_() #0 {
+entry:
+  br label %for.body29
+
+for.body29:                                       ; preds = %for.body29, %entry
+  br i1 undef, label %for.body29, label %for.body65
+
+for.body65:                                       ; preds = %for.body29
+  %0 = load float* undef, align 4, !tbaa !1
+  %vecinit7.i4448 = insertelement <8 x float> undef, float %0, i32 7
+  %1 = load float* null, align 4, !tbaa !1
+  %vecinit7.i4304 = insertelement <8 x float> undef, float %1, i32 7
+  %2 = load float* undef, align 4, !tbaa !1
+  %vecinit7.i4196 = insertelement <8 x float> undef, float %2, i32 7
+  %3 = or i64 0, 16
+  %add.ptr111.sum4096 = add i64 %3, 0
+  %4 = load <8 x float>* null, align 16, !tbaa !5
+  %add.ptr162 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr111.sum4096
+  %__v.i4158 = bitcast float* %add.ptr162 to <8 x float>*
+  %5 = load <8 x float>* %__v.i4158, align 16, !tbaa !5
+  %add.ptr158.sum40975066 = or i64 %add.ptr111.sum4096, 8
+  %add.ptr183 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr158.sum40975066
+  %__v.i4162 = bitcast float* %add.ptr183 to <8 x float>*
+  %6 = load <8 x float>* %__v.i4162, align 16, !tbaa !5
+  %add.ptr200.sum40995067 = or i64 undef, 8
+  %add.ptr225 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr200.sum40995067
+  %__v.i4167 = bitcast float* %add.ptr225 to <8 x float>*
+  %7 = load <8 x float>* %__v.i4167, align 4, !tbaa !5
+  %8 = load <8 x float>* undef, align 16, !tbaa !5
+  %add.ptr242.sum41015068 = or i64 0, 8
+  %add.ptr267 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr242.sum41015068
+  %__v.i4171 = bitcast float* %add.ptr267 to <8 x float>*
+  %9 = load <8 x float>* %__v.i4171, align 4, !tbaa !5
+  %mul.i4690 = fmul <8 x float> %7, undef
+  %add.i4665 = fadd <8 x float> undef, undef
+  %mul.i4616 = fmul <8 x float> %8, undef
+  %mul.i4598 = fmul <8 x float> undef, undef
+  %add.i4597 = fadd <8 x float> undef, %mul.i4598
+  %mul.i4594 = fmul <8 x float> %6, undef
+  %add.i4593 = fadd <8 x float> undef, %mul.i4594
+  %mul.i4578 = fmul <8 x float> %9, undef
+  %add.i4577 = fadd <8 x float> %add.i4593, %mul.i4578
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4577) #1
+  %10 = load <8 x float>* null, align 16, !tbaa !5
+  %11 = load <8 x float>* undef, align 16, !tbaa !5
+  %mul.i4564 = fmul <8 x float> %4, undef
+  %add.i4563 = fadd <8 x float> %10, %mul.i4564
+  %mul.i4560 = fmul <8 x float> %5, undef
+  %add.i4559 = fadd <8 x float> %11, %mul.i4560
+  %add.i4547 = fadd <8 x float> %add.i4563, undef
+  %mul.i4546 = fmul <8 x float> %7, undef
+  %add.i4545 = fadd <8 x float> undef, %mul.i4546
+  %mul.i4544 = fmul <8 x float> %8, undef
+  %add.i4543 = fadd <8 x float> %add.i4559, %mul.i4544
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4547) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4545) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4543) #1
+  %add.i4455 = fadd <8 x float> undef, undef
+  %mul.i4454 = fmul <8 x float> undef, undef
+  %add.i4453 = fadd <8 x float> undef, %mul.i4454
+  %mul.i4440 = fmul <8 x float> zeroinitializer, %vecinit7.i4448
+  %add.i4439 = fadd <8 x float> %add.i4455, %mul.i4440
+  %mul.i4438 = fmul <8 x float> %7, %vecinit7.i4448
+  %add.i4437 = fadd <8 x float> %add.i4453, %mul.i4438
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4439) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4437) #1
+  %add.i4413 = fadd <8 x float> zeroinitializer, undef
+  %mul.i4400 = fmul <8 x float> %8, undef
+  %add.i4399 = fadd <8 x float> undef, %mul.i4400
+  %add.i4397 = fadd <8 x float> %add.i4413, zeroinitializer
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> zeroinitializer) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4399) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4397) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> undef) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> undef) #1
+  %mul.i4330 = fmul <8 x float> %7, undef
+  %add.i4329 = fadd <8 x float> undef, %mul.i4330
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4329) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> undef) #1
+  %mul.i4312 = fmul <8 x float> %4, undef
+  %add.i4311 = fadd <8 x float> undef, %mul.i4312
+  %mul.i4306 = fmul <8 x float> %6, undef
+  %add.i4305 = fadd <8 x float> undef, %mul.i4306
+  %add.i4295 = fadd <8 x float> %add.i4311, undef
+  %mul.i4294 = fmul <8 x float> %7, %vecinit7.i4304
+  %add.i4293 = fadd <8 x float> undef, %mul.i4294
+  %mul.i4292 = fmul <8 x float> %8, %vecinit7.i4304
+  %add.i4291 = fadd <8 x float> undef, %mul.i4292
+  %mul.i4290 = fmul <8 x float> %9, %vecinit7.i4304
+  %add.i4289 = fadd <8 x float> %add.i4305, %mul.i4290
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4295) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4293) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4291) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4289) #1
+  %12 = load <8 x float>* undef, align 16, !tbaa !5
+  %mul.i4274 = fmul <8 x float> undef, undef
+  %add.i4273 = fadd <8 x float> %12, %mul.i4274
+  %mul.i4258 = fmul <8 x float> %7, undef
+  %add.i4257 = fadd <8 x float> %add.i4273, %mul.i4258
+  %mul.i4254 = fmul <8 x float> %9, undef
+  %add.i4253 = fadd <8 x float> undef, %mul.i4254
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4257) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4253) #1
+  %mul.i = fmul <8 x float> %9, %vecinit7.i4196
+  %add.i = fadd <8 x float> undef, %mul.i
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> zeroinitializer) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i) #1
+  unreachable
+}
+
+; Function Attrs: nounwind
+declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) #1
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5 "}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"float", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !3, metadata !3, i64 0}
diff --git a/test/CodeGen/X86/pr21099.ll b/test/CodeGen/X86/pr21099.ll
new file mode 100644
index 0000000..07292c1
--- /dev/null
+++ b/test/CodeGen/X86/pr21099.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -O2 -march=x86-64 -verify-machineinstrs | FileCheck %s
+
+define void @pr21099(i64* %p) {
+; CHECK-LABEL: pr21099
+; CHECK: lock
+; CHECK-NEXT: addq $-2147483648
+; This number is INT32_MIN: 0x80000000UL
+  %1 = atomicrmw add i64* %p, i64 -2147483648 seq_cst
+  ret void
+}
diff --git a/test/CodeGen/X86/pr21529.ll b/test/CodeGen/X86/pr21529.ll
new file mode 100644
index 0000000..655bc84
--- /dev/null
+++ b/test/CodeGen/X86/pr21529.ll
@@ -0,0 +1,15 @@
+; RUN: llc -show-mc-encoding < %s | FileCheck %s
+
+; Test that the direct object emission selects the and variant with 8 bit
+; immediate.
+; We used to get this wrong when using direct object emission, but not when
+; reading assembly.
+
+; CHECK: andq    $-32, %rsp              # encoding: [0x48,0x83,0xe4,0xe0]
+
+target triple = "x86_64-pc-linux"
+
+define void @f() {
+  %foo = alloca i8, align 32
+  ret void
+}
diff --git a/test/CodeGen/X86/pshufb-mask-comments.ll b/test/CodeGen/X86/pshufb-mask-comments.ll
new file mode 100644
index 0000000..7fc9890
--- /dev/null
+++ b/test/CodeGen/X86/pshufb-mask-comments.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -march=x86-64 -mattr=+ssse3 | FileCheck %s
+
+; Test that the pshufb mask comment is correct.
+
+define <16 x i8> @test1(<16 x i8> %V) {
+; CHECK-LABEL: test1:
+; CHECK: pshufb {{.*}}# xmm0 = xmm0[1,0,0,0,0,2,0,0,0,0,3,0,0,0,0,4]
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 1, i8 0, i8 0, i8 0, i8 0, i8 2, i8 0, i8 0, i8 0, i8 0, i8 3, i8 0, i8 0, i8 0, i8 0, i8 4>)
+  ret <16 x i8> %1
+}
+
+; Test that indexes larger than the size of the vector are shown masked (bottom 4 bits).
+
+define <16 x i8> @test2(<16 x i8> %V) {
+; CHECK-LABEL: test2:
+; CHECK: pshufb {{.*}}# xmm0 = xmm0[15,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2]
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 15, i8 0, i8 0, i8 0, i8 0, i8 16, i8 0, i8 0, i8 0, i8 0, i8 17, i8 0, i8 0, i8 0, i8 0, i8 50>)
+  ret <16 x i8> %1
+}
+
+; Test that indexes with bit seven set are shown as zero.
+
+define <16 x i8> @test3(<16 x i8> %V) {
+; CHECK-LABEL: test3:
+; CHECK: pshufb {{.*}}# xmm0 = xmm0[1,0,0,15,0,2,0,0],zero,xmm0[0,3,0,0],zero,xmm0[0,4]
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 1, i8 0, i8 0, i8 127, i8 0, i8 2, i8 0, i8 0, i8 128, i8 0, i8 3, i8 0, i8 0, i8 255, i8 0, i8 4>)
+  ret <16 x i8> %1
+}
+
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
diff --git a/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
index d8e4572..49d58f4 100644
--- a/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
+++ b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
@@ -2,10 +2,12 @@
 ; Without the last chance recoloring, this test fails with:
 ; "ran out of registers".
 
-; RUN: not llc -regalloc=greedy -relocation-model=pic -lcr-max-depth=0  < %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEPTH
+; NOTE: With the fix to PR18883, we don't actually run out of registers here
+; any more, and so those checks are disabled. This test remains only for general coverage.
+; XXX: not llc -regalloc=greedy -relocation-model=pic -lcr-max-depth=0  < %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEPTH
 ; Test whether failure due to cutoff for depth is reported
 
-; RUN: not llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1  < %s 2>&1 | FileCheck %s --check-prefix=CHECK-INTERF
+; XXX: not llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1  < %s 2>&1 | FileCheck %s --check-prefix=CHECK-INTERF
 ; Test whether failure due to cutoff for interference is reported
 
 ; RUN: llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1 -lcr-max-depth=0 -exhaustive-register-search < %s > %t 2>&1
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
new file mode 100644
index 0000000..83b86ac
--- /dev/null
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -0,0 +1,109 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+use-recip-est,+avx -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE
+
+; If the target's divss/divps instructions are substantially
+; slower than rcpss/rcpps with a Newton-Raphson refinement,
+; we should generate the estimate sequence.
+
+; See PR21385 ( http://llvm.org/bugs/show_bug.cgi?id=21385 )
+; for details about the accuracy, speed, and implementation
+; differences of x86 reciprocal estimates.
+
+define float @reciprocal_estimate(float %x) #0 {
+  %div = fdiv fast float 1.0, %x
+  ret float %div
+
+; CHECK-LABEL: reciprocal_estimate:
+; CHECK: movss
+; CHECK-NEXT: divss
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate:
+; BTVER2: vrcpss
+; BTVER2: vmulss
+; BTVER2: vsubss
+; BTVER2: vmulss
+; BTVER2: vaddss
+; BTVER2-NEXT: retq
+
+; REFINE-LABEL: reciprocal_estimate:
+; REFINE: vrcpss
+; REFINE: vmulss
+; REFINE: vsubss
+; REFINE: vmulss
+; REFINE: vaddss
+; REFINE: vmulss
+; REFINE: vsubss
+; REFINE: vmulss
+; REFINE: vaddss
+; REFINE-NEXT: retq
+}
+
+define <4 x float> @reciprocal_estimate_v4f32(<4 x float> %x) #0 {
+  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  ret <4 x float> %div
+
+; CHECK-LABEL: reciprocal_estimate_v4f32:
+; CHECK: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate_v4f32:
+; BTVER2: vrcpps
+; BTVER2: vmulps
+; BTVER2: vsubps
+; BTVER2: vmulps
+; BTVER2: vaddps
+; BTVER2-NEXT: retq
+
+; REFINE-LABEL: reciprocal_estimate_v4f32:
+; REFINE: vrcpps
+; REFINE: vmulps
+; REFINE: vsubps
+; REFINE: vmulps
+; REFINE: vaddps
+; REFINE: vmulps
+; REFINE: vsubps
+; REFINE: vmulps
+; REFINE: vaddps
+; REFINE-NEXT: retq
+}
+
+define <8 x float> @reciprocal_estimate_v8f32(<8 x float> %x) #0 {
+  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  ret <8 x float> %div
+
+; CHECK-LABEL: reciprocal_estimate_v8f32:
+; CHECK: movaps
+; CHECK: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: divps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate_v8f32:
+; BTVER2: vrcpps
+; BTVER2: vmulps
+; BTVER2: vsubps
+; BTVER2: vmulps
+; BTVER2: vaddps
+; BTVER2-NEXT: retq
+
+; REFINE-LABEL: reciprocal_estimate_v8f32:
+; REFINE: vrcpps
+; REFINE: vmulps
+; REFINE: vsubps
+; REFINE: vmulps
+; REFINE: vaddps
+; REFINE: vmulps
+; REFINE: vsubps
+; REFINE: vmulps
+; REFINE: vaddps
+; REFINE-NEXT: retq
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/return_zeroext_i2.ll b/test/CodeGen/X86/return_zeroext_i2.ll
new file mode 100644
index 0000000..d535b0c
--- /dev/null
+++ b/test/CodeGen/X86/return_zeroext_i2.ll
@@ -0,0 +1,7 @@
+; RUN: llc -mtriple=i386-pc-win32 < %s | FileCheck %s 
+; Check that the testcase does not crash
+define zeroext i2 @crash () {
+  ret i2 0
+}
+; CHECK: xorl	%eax, %eax
+; CHECK-NEXT: retl
diff --git a/test/CodeGen/X86/segmented-stacks-dynamic.ll b/test/CodeGen/X86/segmented-stacks-dynamic.ll
index b82be41..e34ba54 100644
--- a/test/CodeGen/X86/segmented-stacks-dynamic.ll
+++ b/test/CodeGen/X86/segmented-stacks-dynamic.ll
@@ -1,7 +1,9 @@
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -verify-machineinstrs | FileCheck %s -check-prefix=X32ABI
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -filetype=obj
 
 ; Just to prevent the alloca from being optimized away
 declare void @dummy_use(i32*, i32)
@@ -61,6 +63,26 @@ false:
 ; X64-NEXT: callq __morestack_allocate_stack_space
 ; X64:      movq %rax, %rdi
 
+; X32ABI-LABEL:      test_basic:
+
+; X32ABI:      cmpl %fs:64, %esp
+; X32ABI-NEXT: ja      .LBB0_2
+
+; X32ABI:      movl $24, %r10d
+; X32ABI-NEXT: movl $0, %r11d
+; X32ABI-NEXT: callq __morestack
+; X32ABI-NEXT: ret
+
+; X32ABI:      movl %esp, %[[EDI:edi|eax]]
+; X32ABI:      subl %{{.*}}, %[[EDI]]
+; X32ABI-NEXT: cmpl %[[EDI]], %fs:64
+
+; X32ABI:      movl %[[EDI]], %esp
+
+; X32ABI:      movl %{{.*}}, %edi
+; X32ABI-NEXT: callq __morestack_allocate_stack_space
+; X32ABI:      movl %eax, %edi
+
 }
 
 attributes #0 = { "split-stack" }
diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll
index 9dab3cd..2db7c11 100644
--- a/test/CodeGen/X86/segmented-stacks.ll
+++ b/test/CodeGen/X86/segmented-stacks.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32-Linux
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux  -verify-machineinstrs | FileCheck %s -check-prefix=X64-Linux
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -verify-machineinstrs | FileCheck %s -check-prefix=X32ABI
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -verify-machineinstrs | FileCheck %s -check-prefix=X32-Darwin
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -verify-machineinstrs | FileCheck %s -check-prefix=X64-Darwin
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -verify-machineinstrs | FileCheck %s -check-prefix=X32-MinGW
@@ -9,6 +10,7 @@
 ; We used to crash with filetype=obj
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -filetype=obj
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -filetype=obj
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -filetype=obj
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -filetype=obj
@@ -51,6 +53,16 @@ define void @test_basic() #0 {
 ; X64-Linux-NEXT:  callq __morestack
 ; X64-Linux-NEXT:  ret
 
+; X32ABI-LABEL:       test_basic:
+
+; X32ABI:       cmpl %fs:64, %esp
+; X32ABI-NEXT:  ja      .LBB0_2
+
+; X32ABI:       movl $40, %r10d
+; X32ABI-NEXT:  movl $0, %r11d
+; X32ABI-NEXT:  callq __morestack
+; X32ABI-NEXT:  ret
+
 ; X32-Darwin-LABEL:      test_basic:
 
 ; X32-Darwin:      movl $432, %ecx
@@ -129,6 +141,16 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 ; X64-Linux-NEXT:  ret
 ; X64-Linux-NEXT:  movq %rax, %r10
 
+; X32ABI:       cmpl %fs:64, %esp
+; X32ABI-NEXT:  ja      .LBB1_2
+
+; X32ABI:       movl %r10d, %eax
+; X32ABI-NEXT:  movl $56, %r10d
+; X32ABI-NEXT:  movl $0, %r11d
+; X32ABI-NEXT:  callq __morestack
+; X32ABI-NEXT:  ret
+; X32ABI-NEXT:  movq %rax, %r10
+
 ; X32-Darwin:      movl $432, %edx
 ; X32-Darwin-NEXT: cmpl %gs:(%edx), %esp
 ; X32-Darwin-NEXT: ja      LBB1_2
@@ -202,6 +224,15 @@ define void @test_large() #0 {
 ; X64-Linux-NEXT:  callq __morestack
 ; X64-Linux-NEXT:  ret
 
+; X32ABI:       leal -40008(%rsp), %r11d
+; X32ABI-NEXT:  cmpl %fs:64, %r11d
+; X32ABI-NEXT:  ja      .LBB2_2
+
+; X32ABI:       movl $40008, %r10d
+; X32ABI-NEXT:  movl $0, %r11d
+; X32ABI-NEXT:  callq __morestack
+; X32ABI-NEXT:  ret
+
 ; X32-Darwin:      leal -40012(%esp), %ecx
 ; X32-Darwin-NEXT: movl $432, %eax
 ; X32-Darwin-NEXT: cmpl %gs:(%eax), %ecx
@@ -276,6 +307,16 @@ define fastcc void @test_fastcc() #0 {
 ; X64-Linux-NEXT:  callq __morestack
 ; X64-Linux-NEXT:  ret
 
+; X32ABI-LABEL:       test_fastcc:
+
+; X32ABI:       cmpl %fs:64, %esp
+; X32ABI-NEXT:  ja      .LBB3_2
+
+; X32ABI:       movl $40, %r10d
+; X32ABI-NEXT:  movl $0, %r11d
+; X32ABI-NEXT:  callq __morestack
+; X32ABI-NEXT:  ret
+
 ; X32-Darwin-LABEL:      test_fastcc:
 
 ; X32-Darwin:      movl $432, %eax
@@ -356,6 +397,17 @@ define fastcc void @test_fastcc_large() #0 {
 ; X64-Linux-NEXT:  callq __morestack
 ; X64-Linux-NEXT:  ret
 
+; X32ABI-LABEL:       test_fastcc_large:
+
+; X32ABI:       leal -40008(%rsp), %r11d
+; X32ABI-NEXT:  cmpl %fs:64, %r11d
+; X32ABI-NEXT:  ja      .LBB4_2
+
+; X32ABI:       movl $40008, %r10d
+; X32ABI-NEXT:  movl $0, %r11d
+; X32ABI-NEXT:  callq __morestack
+; X32ABI-NEXT:  ret
+
 ; X32-Darwin-LABEL:      test_fastcc_large:
 
 ; X32-Darwin:      leal -40012(%esp), %eax
@@ -446,6 +498,9 @@ define void @test_nostack() #0 {
 ; X64-Linux-LABEL: test_nostack:
 ; X32-Linux-NOT:   callq __morestack
 
+; X32ABI-LABEL: test_nostack:
+; X32ABI-NOT:   callq __morestack
+
 ; X32-Darwin-LABEL: test_nostack:
 ; X32-Darwin-NOT:   calll __morestack
 
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index cdd258d..7e6f153 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -357,3 +357,47 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
 ; ATOM: cmpl $15, %edi
 ; ATOM: cmovgel %edx
 }
+
+; CHECK-LABEL: @trunc_select_miscompile
+; CHECK-NOT: sarb
+define i32 @trunc_select_miscompile(i32 %a, i1 zeroext %cc) {
+  %tmp1 = select i1 %cc, i32 3, i32 2
+  %tmp2 = shl i32 %a, %tmp1
+  ret i32 %tmp2
+}
+
+define void @test19() {
+; This is a massive reduction of an llvm-stress test case that generates
+; interesting chains feeding setcc and eventually a f32 select operation. This
+; is intended to exercise the SELECT formation in the DAG combine simplifying
+; a simplified select_cc node. If it it regresses and is no longer triggering
+; that code path, it can be deleted.
+;
+; CHECK-LABEL: @test19
+; CHECK: testb
+; CHECK: cmpl
+; CHECK: ucomiss
+
+BB:
+  br label %CF
+
+CF:
+  %Cmp10 = icmp ule i8 undef, undef
+  br i1 %Cmp10, label %CF, label %CF250
+
+CF250:
+  %E12 = extractelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 2
+  %Cmp32 = icmp ugt i1 %Cmp10, false
+  br i1 %Cmp32, label %CF, label %CF242
+
+CF242:
+  %Cmp38 = icmp uge i32 %E12, undef
+  %FC = uitofp i1 %Cmp38 to float
+  %Sl59 = select i1 %Cmp32, float %FC, float undef
+  %Cmp60 = fcmp ugt float undef, undef
+  br i1 %Cmp60, label %CF242, label %CF244
+
+CF244:
+  %B122 = fadd float %Sl59, undef
+  ret void
+}
diff --git a/test/CodeGen/X86/sext-i1.ll b/test/CodeGen/X86/sext-i1.ll
index 64de0ae..1a575db 100644
--- a/test/CodeGen/X86/sext-i1.ll
+++ b/test/CodeGen/X86/sext-i1.ll
@@ -61,3 +61,36 @@ if.end:                                           ; preds = %if.then, %entry
   %xor27 = xor i32 undef, %cond                   ; <i32> [#uses=0]
   ret i32 0
 }
+
+define i32 @t4(i64 %x) nounwind readnone ssp {
+entry:
+; 32-LABEL: t4:
+; 32: movl
+; 32: orl
+; 32: movl
+; 32: je
+; 32: xorl
+
+; 64-LABEL: t4:
+; 64: cmpq $1
+; 64: sbbl
+  %0 = icmp eq i64 %x, 0
+  %1 = sext i1 %0 to i32
+  ret i32 %1
+}
+
+define i64 @t5(i32 %x) nounwind readnone ssp {
+entry:
+; 32-LABEL: t5:
+; 32: cmpl $1
+; 32: sbbl
+; 32: movl
+
+; 64-LABEL: t5:
+; 64: cmpl $1
+; 64: sbbq
+  %0 = icmp eq i32 %x, 0
+  %1 = sext i1 %0 to i64
+  ret i64 %1
+}
+
diff --git a/test/CodeGen/X86/shift-parts.ll b/test/CodeGen/X86/shift-parts.ll
index ddad307..763da63 100644
--- a/test/CodeGen/X86/shift-parts.ll
+++ b/test/CodeGen/X86/shift-parts.ll
@@ -7,13 +7,13 @@
 
 ; CHECK: shrdq
 
-define i32 @int87(i32 %uint64p_8) nounwind {
+define i32 @int87(i32 %uint64p_8, i1 %cond) nounwind {
 entry:
   %srcval4 = load i320* bitcast (%0* @g_144 to i320*), align 8 ; <i320> [#uses=1]
   br label %for.cond
 
 for.cond:                                         ; preds = %for.cond, %entry
-  %call3.in.in.in.v = select i1 undef, i320 192, i320 128 ; <i320> [#uses=1]
+  %call3.in.in.in.v = select i1 %cond, i320 192, i320 128 ; <i320> [#uses=1]
   %call3.in.in.in = lshr i320 %srcval4, %call3.in.in.in.v ; <i320> [#uses=1]
   %call3.in = trunc i320 %call3.in.in.in to i32   ; <i32> [#uses=1]
   %tobool = icmp eq i32 %call3.in, 0              ; <i1> [#uses=1]
diff --git a/test/CodeGen/X86/shuffle-combine-crash.ll b/test/CodeGen/X86/shuffle-combine-crash.ll
new file mode 100644
index 0000000..6ab7b97
--- /dev/null
+++ b/test/CodeGen/X86/shuffle-combine-crash.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 
+
+; Verify that DAGCombiner does not crash when checking if it is
+; safe to fold the shuffles in function @sample_test according to rule
+;  (shuffle (shuffle A, Undef, M0), Undef, M1) -> (shuffle A, Undef, M2)
+;
+; The DAGCombiner avoids folding shuffles if
+; the resulting shuffle dag node is not legal for the target.
+; That means, the shuffle must have legal type and legal mask.
+;
+; Before, the DAGCombiner forgot to check if the resulting shuffle
+; was legal. It instead just called method
+; 'X86TargetLowering::isShuffleMaskLegal'; however, that was not enough since
+; that method always expect to have a valid vector type in input.
+; As a consequence, compiling the function below would have caused a crash.
+
+define void @sample_test() {
+  br i1 undef, label %5, label %1
+
+; <label>:1                                       ; preds = %0
+  %2 = load <4 x i8>* undef
+  %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
+  %4 = shufflevector <4 x i8> %3, <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  store <4 x i8> %4, <4 x i8>* undef
+  br label %5
+
+; <label>:5                                       ; preds = %1, %0
+  ret void
+}
+
diff --git a/test/CodeGen/X86/sincos-opt.ll b/test/CodeGen/X86/sincos-opt.ll
index 2dc8816..1e34a2b 100644
--- a/test/CodeGen/X86/sincos-opt.ll
+++ b/test/CodeGen/X86/sincos-opt.ll
@@ -15,7 +15,8 @@ entry:
 
 ; OSX_SINCOS-LABEL: test1:
 ; OSX_SINCOS: callq ___sincosf_stret
-; OSX_SINCOS: pshufd $1, %xmm0, %xmm1
+; OSX_SINCOS: movaps %xmm0, %xmm1
+; OSX_SINCOS: shufps {{.*}} ## xmm1 = xmm1[1,1,2,3]
 ; OSX_SINCOS: addss %xmm0, %xmm1
 
 ; OSX_NOOPT: test1
diff --git a/test/CodeGen/X86/sink-blockfreq.ll b/test/CodeGen/X86/sink-blockfreq.ll
new file mode 100644
index 0000000..6e3a003
--- /dev/null
+++ b/test/CodeGen/X86/sink-blockfreq.ll
@@ -0,0 +1,45 @@
+; RUN: llc -disable-machine-licm -machine-sink-bfi=true -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_BFI
+; RUN: llc -disable-machine-licm -machine-sink-bfi=false -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_NOBFI
+
+; Test that by changing BlockFrequencyInfo we change the order in which
+; machine-sink looks for sucessor blocks. By not using BFI, both G and B
+; have the same loop depth and no instructions is sinked - B is selected but
+; can't be used as to avoid breaking a non profitable critical edge. By using
+; BFI, "mul" is sinked into the less frequent block G.
+define i32 @sink_freqinfo(i32 %a, i32 %b) nounwind uwtable ssp {
+; MSINK_BFI-LABEL: sink_freqinfo
+; MSINK_BFI: jl
+; MSINK_BFI-NEXT: ## BB#
+; MSINK_BFI-NEXT: imull
+
+; MSINK_NOBFI-LABEL: sink_freqinfo
+; MSINK_NOBFI: imull
+; MSINK_NOBFI: jl
+entry:
+  br label %B
+
+B:
+  %ee = phi i32 [ 0, %entry ], [ %inc, %F ]
+  %xx = sub i32 %a, %ee
+  %cond0 = icmp slt i32 %xx, 0
+  br i1 %cond0, label %F, label %exit, !prof !0
+
+F:
+  %inc = add nsw i32 %xx, 2
+  %aa = mul nsw i32 %b, %inc
+  %exitcond = icmp slt i32 %inc, %a
+  br i1 %exitcond, label %B, label %G, !prof !1
+
+G:
+  %ii = add nsw i32 %aa, %a
+  %ll = add i32 %b, 45
+  %exitcond2 = icmp sge i32 %ii, %b
+  br i1 %exitcond2, label %G, label %exit, !prof !2
+
+exit:
+  ret i32 0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 4, i32 1}
+!1 = metadata !{metadata !"branch_weights", i32 128, i32 1}
+!2 = metadata !{metadata !"branch_weights", i32 1, i32 1}
diff --git a/test/CodeGen/X86/sink-out-of-loop.ll b/test/CodeGen/X86/sink-out-of-loop.ll
index c600f92..6757f31 100644
--- a/test/CodeGen/X86/sink-out-of-loop.ll
+++ b/test/CodeGen/X86/sink-out-of-loop.ll
@@ -5,7 +5,7 @@
 ; MOV32ri outside the loop.
 ; rdar://11980766
 define i32 @sink_succ(i32 %argc, i8** nocapture %argv) nounwind uwtable ssp {
-; CHECK: sink_succ
+; CHECK-LABEL: sink_succ
 ; CHECK: [[OUTER_LN1:LBB0_[0-9]+]]: ## %preheader
 ; CHECK: %exit
 ; CHECK-NOT: movl
@@ -52,3 +52,24 @@ for.body2:
 for.end20:
   ret i32 0
 }
+
+define i32 @sink_out_of_loop(i32 %n, i32* %output) {
+; CHECK-LABEL: sink_out_of_loop:
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i2, %loop ]
+  %j = mul i32 %i, %i
+  %addr = getelementptr i32* %output, i32 %i
+  store i32 %i, i32* %addr
+  %i2 = add i32 %i, 1
+  %exit_cond = icmp sge i32 %i2, %n
+  br i1 %exit_cond, label %exit, label %loop
+
+exit:
+; CHECK: BB#2
+; CHECK: imull %eax, %eax
+; CHECK: retq
+  ret i32 %j
+}
diff --git a/test/CodeGen/X86/slow-incdec.ll b/test/CodeGen/X86/slow-incdec.ll
new file mode 100644
index 0000000..541d992
--- /dev/null
+++ b/test/CodeGen/X86/slow-incdec.ll
@@ -0,0 +1,80 @@
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-slow-incdec < %s | FileCheck -check-prefix=INCDEC %s
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+slow-incdec < %s | FileCheck -check-prefix=ADD %s
+
+; check -mattr=-slow-incdec
+; INCDEC-NOT: addl $-1
+; INCDEC: dec
+; INCDEC-NOT: addl $1
+; INCDEC: inc
+
+; check -mattr=+slow-incdec
+; ADD: addl $-1
+; ADD-NOT: dec
+; ADD: addl $1
+; ADD-NOT: inc
+
+; Function Attrs: nounwind readonly
+define i32 @slow_1(i32* nocapture readonly %a, i32 %s) #0 {
+entry:
+  %cmp5 = icmp eq i32 %s, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond:                                         ; preds = %for.body
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %for.end.loopexit, label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.cond
+  %i.06 = phi i32 [ %dec, %for.cond ], [ %s, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32* %a, i32 %i.06
+  %0 = load i32* %arrayidx, align 4, !tbaa !1
+  %cmp1 = icmp eq i32 %0, 0
+;
+  %dec = add nsw i32 %i.06, -1
+  br i1 %cmp1, label %for.end.loopexit, label %for.cond
+
+for.end.loopexit:                                 ; preds = %for.cond, %for.body
+  %i.0.lcssa.ph = phi i32 [ 0, %for.cond ], [ %i.06, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %i.0.lcssa.ph, %for.end.loopexit ]
+  ret i32 %i.0.lcssa
+}
+
+; Function Attrs: nounwind readonly
+define i32 @slow_2(i32* nocapture readonly %a, i32 %s) #0 {
+entry:
+  %cmp5 = icmp eq i32 %s, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond:                                         ; preds = %for.body
+  %cmp = icmp eq i32 %inc, 0
+  br i1 %cmp, label %for.end.loopexit, label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.cond
+  %i.06 = phi i32 [ %inc, %for.cond ], [ %s, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32* %a, i32 %i.06
+  %0 = load i32* %arrayidx, align 4, !tbaa !1
+  %cmp1 = icmp eq i32 %0, 0
+  %inc = add nsw i32 %i.06, 1
+  br i1 %cmp1, label %for.end.loopexit, label %for.cond
+
+for.end.loopexit:                                 ; preds = %for.cond, %for.body
+  %i.0.lcssa.ph = phi i32 [ 0, %for.cond ], [ %i.06, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %i.0.lcssa.ph, %for.end.loopexit ]
+  ret i32 %i.0.lcssa
+}
+
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/X86/splat-for-size.ll b/test/CodeGen/X86/splat-for-size.ll
new file mode 100644
index 0000000..c052ad2
--- /dev/null
+++ b/test/CodeGen/X86/splat-for-size.ll
@@ -0,0 +1,141 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx2 < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX2
+
+; Check constant loads of every 128-bit and 256-bit vector type 
+; for size optimization using splat ops available with AVX and AVX2.
+
+; There is no AVX broadcast from double to 128-bit vector because movddup has been around since SSE3 (grrr).
+define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
+  %add = fadd <2 x double> %x, <double 1.0, double 1.0>
+  ret <2 x double> %add
+; CHECK-LABEL: splat_v2f64
+; CHECK: vmovddup
+; CHECK: vaddpd 
+; CHECK-NEXT: retq
+}
+
+define <4 x double> @splat_v4f64(<4 x double> %x) #0 {
+  %add = fadd <4 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0>
+  ret <4 x double> %add
+; CHECK-LABEL: splat_v4f64
+; CHECK: vbroadcastsd 
+; CHECK-NEXT: vaddpd
+; CHECK-NEXT: retq
+}
+
+define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
+  %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+  ret <4 x float> %add
+; CHECK-LABEL: splat_v4f32
+; CHECK: vbroadcastss 
+; CHECK-NEXT: vaddps
+; CHECK-NEXT: retq
+}
+
+define <8 x float> @splat_v8f32(<8 x float> %x) #0 {
+  %add = fadd <8 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
+  ret <8 x float> %add
+; CHECK-LABEL: splat_v8f32
+; CHECK: vbroadcastss 
+; CHECK-NEXT: vaddps
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, so fake it: use vmovddup to splat 64-bit value.
+; We also generate vmovddup for AVX2 because it's one byte smaller than vpbroadcastq.
+define <2 x i64> @splat_v2i64(<2 x i64> %x) #0 {
+  %add = add <2 x i64> %x, <i64 1, i64 1>
+  ret <2 x i64> %add
+; CHECK-LABEL: splat_v2i64
+; CHECK: vmovddup 
+; CHECK: vpaddq
+; CHECK-NEXT: retq
+}
+
+; AVX can't do 256-bit integer ops, so we split this into two 128-bit vectors,
+; and then we fake it: use vmovddup to splat 64-bit value.
+define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
+  %add = add <4 x i64> %x, <i64 1, i64 1, i64 1, i64 1>
+  ret <4 x i64> %add
+; CHECK-LABEL: splat_v4i64
+; AVX: vmovddup
+; AVX: vpaddq 
+; AVX: vpaddq 
+; AVX2: vpbroadcastq 
+; AVX2: vpaddq 
+; CHECK: retq
+}
+
+; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
+define <4 x i32> @splat_v4i32(<4 x i32> %x) #0 {
+  %add = add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %add
+; CHECK-LABEL: splat_v4i32
+; AVX: vbroadcastss
+; AVX2: vpbroadcastd 
+; CHECK-NEXT: vpaddd 
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
+define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
+  %add = add <8 x i32> %x, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i32> %add
+; CHECK-LABEL: splat_v8i32
+; AVX: vbroadcastss
+; AVX: vpaddd 
+; AVX: vpaddd 
+; AVX2: vpbroadcastd 
+; AVX2: vpaddd 
+; CHECK: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
+define <8 x i16> @splat_v8i16(<8 x i16> %x) #0 {
+  %add = add <8 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %add
+; CHECK-LABEL: splat_v8i16
+; AVX-NOT: broadcast
+; AVX2: vpbroadcastw 
+; CHECK: vpaddw 
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
+define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
+  %add = add <16 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <16 x i16> %add
+; CHECK-LABEL: splat_v16i16
+; AVX-NOT: broadcast
+; AVX: vpaddw 
+; AVX: vpaddw 
+; AVX2: vpbroadcastw 
+; AVX2: vpaddw 
+; CHECK: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
+define <16 x i8> @splat_v16i8(<16 x i8> %x) #0 {
+  %add = add <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %add
+; CHECK-LABEL: splat_v16i8
+; AVX-NOT: broadcast
+; AVX2: vpbroadcastb 
+; CHECK: vpaddb 
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
+define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
+  %add = add <32 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <32 x i8> %add
+; CHECK-LABEL: splat_v32i8
+; AVX-NOT: broadcast
+; AVX: vpaddb 
+; AVX: vpaddb 
+; AVX2: vpbroadcastb 
+; AVX2: vpaddb 
+; CHECK: retq
+}
+
+attributes #0 = { optsize }
diff --git a/test/CodeGen/X86/splat-scalar-load.ll b/test/CodeGen/X86/splat-scalar-load.ll
deleted file mode 100644
index 4d59b9c..0000000
--- a/test/CodeGen/X86/splat-scalar-load.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 -mcpu=nehalem | FileCheck %s
-; rdar://7434544
-
-define <2 x i64> @t2() nounwind {
-entry:
-; CHECK-LABEL: t2:
-; CHECK: pshufd	$85, (%esp), %xmm0
-  %array = alloca [8 x float], align 4
-  %arrayidx = getelementptr inbounds [8 x float]* %array, i32 0, i32 1
-  %tmp2 = load float* %arrayidx
-  %vecinit = insertelement <4 x float> undef, float %tmp2, i32 0
-  %vecinit5 = insertelement <4 x float> %vecinit, float %tmp2, i32 1
-  %vecinit7 = insertelement <4 x float> %vecinit5, float %tmp2, i32 2
-  %vecinit9 = insertelement <4 x float> %vecinit7, float %tmp2, i32 3
-  %0 = bitcast <4 x float> %vecinit9 to <2 x i64>
-  ret <2 x i64> %0
-}
diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll
index fc79e31..24b175e 100644
--- a/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/test/CodeGen/X86/sqrt-fastmath.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
 
 ; generated using "clang -S -O2 -ffast-math -emit-llvm sqrt.c" from
 ; #include <math.h>
@@ -52,9 +53,80 @@ entry:
   ret x86_fp80 %call
 }
 
-; Function Attrs: nounwind readnone
 declare x86_fp80 @__sqrtl_finite(x86_fp80) #1
 
+declare float @llvm.sqrt.f32(float) #1
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #1
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1
+
+; If the target's sqrtss and divss instructions are substantially
+; slower than rsqrtss with a Newton-Raphson refinement, we should
+; generate the estimate sequence.
+
+define float @reciprocal_square_root(float %x) #0 {
+  %sqrt = tail call float @llvm.sqrt.f32(float %x)
+  %div = fdiv fast float 1.0, %sqrt
+  ret float %div
+
+; CHECK-LABEL: reciprocal_square_root:
+; CHECK: sqrtss
+; CHECK-NEXT: movss
+; CHECK-NEXT: divss
+; CHECK-NEXT: retq
+; BTVER2-LABEL: reciprocal_square_root:
+; BTVER2: vrsqrtss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vaddss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: retq
+}
+
+define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 {
+  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
+  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
+  ret <4 x float> %div
+
+; CHECK-LABEL: reciprocal_square_root_v4f32:
+; CHECK: sqrtps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: retq
+; BTVER2-LABEL: reciprocal_square_root_v4f32:
+; BTVER2: vrsqrtps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vaddps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: retq
+}
+
+define <8 x float> @reciprocal_square_root_v8f32(<8 x float> %x) #0 {
+  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
+  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
+  ret <8 x float> %div
+
+; CHECK-LABEL: reciprocal_square_root_v8f32:
+; CHECK: sqrtps
+; CHECK-NEXT: sqrtps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: divps
+; CHECK-NEXT: retq
+; BTVER2-LABEL: reciprocal_square_root_v8f32:
+; BTVER2: vrsqrtps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vaddps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: retq
+}
+
+
 attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/X86/sse-align-12.ll b/test/CodeGen/X86/sse-align-12.ll
index 2351fd6..396da0f 100644
--- a/test/CodeGen/X86/sse-align-12.ll
+++ b/test/CodeGen/X86/sse-align-12.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -march=x86-64 -mcpu=nehalem | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=nehalem | FileCheck %s
 
-; CHECK-LABEL: a:
-; CHECK: movdqu
-; CHECK: pshufd
 define <4 x float> @a(<4 x float>* %y) nounwind {
+; CHECK-LABEL: a:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movups (%rdi), %xmm0
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT:    retq
   %x = load <4 x float>* %y, align 4
   %a = extractelement <4 x float> %x, i32 0
   %b = extractelement <4 x float> %x, i32 1
@@ -16,10 +18,12 @@ define <4 x float> @a(<4 x float>* %y) nounwind {
   ret <4 x float> %s
 }
 
-; CHECK-LABEL: b:
-; CHECK: movups
-; CHECK: unpckhps
 define <4 x float> @b(<4 x float>* %y, <4 x float> %z) nounwind {
+; CHECK-LABEL: b:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movups (%rdi), %xmm1
+; CHECK-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    retq
   %x = load <4 x float>* %y, align 4
   %a = extractelement <4 x float> %x, i32 2
   %b = extractelement <4 x float> %x, i32 3
@@ -32,10 +36,12 @@ define <4 x float> @b(<4 x float>* %y, <4 x float> %z) nounwind {
   ret <4 x float> %s
 }
 
-; CHECK-LABEL: c:
-; CHECK: movupd
-; CHECK: shufpd
 define <2 x double> @c(<2 x double>* %y) nounwind {
+; CHECK-LABEL: c:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movupd (%rdi), %xmm0
+; CHECK-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    retq
   %x = load <2 x double>* %y, align 8
   %a = extractelement <2 x double> %x, i32 0
   %c = extractelement <2 x double> %x, i32 1
@@ -44,10 +50,12 @@ define <2 x double> @c(<2 x double>* %y) nounwind {
   ret <2 x double> %r
 }
 
-; CHECK-LABEL: d:
-; CHECK: movupd
-; CHECK: unpckhpd
 define <2 x double> @d(<2 x double>* %y, <2 x double> %z) nounwind {
+; CHECK-LABEL: d:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movupd (%rdi), %xmm1
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-NEXT:    retq
   %x = load <2 x double>* %y, align 8
   %a = extractelement <2 x double> %x, i32 1
   %c = extractelement <2 x double> %z, i32 1
diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll
index 5122c44..da36a42 100644
--- a/test/CodeGen/X86/sse-minmax.ll
+++ b/test/CodeGen/X86/sse-minmax.ll
@@ -138,8 +138,7 @@ define double @ole_inverse(double %x, double %y) nounwind {
 ; CHECK-NEXT: ret
 ; UNSAFE-LABEL:      ogt_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ogt_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -157,8 +156,7 @@ define double @ogt_x(double %x) nounwind {
 ; CHECK-NEXT: ret
 ; UNSAFE-LABEL:      olt_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      olt_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -177,8 +175,7 @@ define double @olt_x(double %x) nounwind {
 ; CHECK-NEXT: ret
 ; UNSAFE-LABEL:      ogt_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
-; UNSAFE-NEXT: minsd  %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ogt_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
@@ -198,8 +195,7 @@ define double @ogt_inverse_x(double %x) nounwind {
 ; CHECK-NEXT: ret
 ; UNSAFE-LABEL:      olt_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd  %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      olt_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
@@ -217,8 +213,7 @@ define double @olt_inverse_x(double %x) nounwind {
 ; CHECK-NEXT: andpd
 ; UNSAFE-LABEL:      oge_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      oge_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -235,8 +230,7 @@ define double @oge_x(double %x) nounwind {
 ; CHECK-NEXT: andpd
 ; UNSAFE-LABEL:      ole_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ole_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -253,8 +247,7 @@ define double @ole_x(double %x) nounwind {
 ; CHECK-NEXT: andnpd
 ; UNSAFE-LABEL:      oge_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: minsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
+; UNSAFE-NEXT: minsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      oge_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -271,8 +264,7 @@ define double @oge_inverse_x(double %x) nounwind {
 ; CHECK:      cmplesd %xmm
 ; UNSAFE-LABEL:      ole_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ole_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -412,8 +404,7 @@ define double @ule_inverse(double %x, double %y) nounwind {
 ; CHECK-NEXT: andpd
 ; UNSAFE-LABEL:      ugt_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ugt_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -430,8 +421,7 @@ define double @ugt_x(double %x) nounwind {
 ; CHECK-NEXT: andpd
 ; UNSAFE-LABEL:      ult_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: minsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ult_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -448,8 +438,7 @@ define double @ult_x(double %x) nounwind {
 ; CHECK-NEXT: andnpd
 ; UNSAFE-LABEL:      ugt_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: minsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
+; UNSAFE-NEXT: minsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ugt_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -467,8 +456,7 @@ define double @ugt_inverse_x(double %x) nounwind {
 ; CHECK-NEXT: andnpd
 ; UNSAFE-LABEL:      ult_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ult_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -488,8 +476,7 @@ define double @ult_inverse_x(double %x) nounwind {
 ; CHECK-NEXT: ret
 ; UNSAFE-LABEL:      uge_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd  %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      uge_x:
 ; FINITE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
@@ -508,8 +495,7 @@ define double @uge_x(double %x) nounwind {
 ; CHECK-NEXT: ret
 ; UNSAFE-LABEL:      ule_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
-; UNSAFE-NEXT: minsd  %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ule_x:
 ; FINITE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
@@ -527,8 +513,7 @@ define double @ule_x(double %x) nounwind {
 ; CHECK-NEXT: ret
 ; UNSAFE-LABEL:      uge_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      uge_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -547,8 +532,7 @@ define double @uge_inverse_x(double %x) nounwind {
 ; CHECK-NEXT: ret
 ; UNSAFE-LABEL:      ule_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ule_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
diff --git a/test/CodeGen/X86/sse-scalar-fp-arith-2.ll b/test/CodeGen/X86/sse-scalar-fp-arith-2.ll
deleted file mode 100644
index 600ee1b..0000000
--- a/test/CodeGen/X86/sse-scalar-fp-arith-2.ll
+++ /dev/null
@@ -1,423 +0,0 @@
-; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
-; RUN: llc -mtriple=x86_64-pc-linux -mattr=-sse4.1 -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
-; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s
-
-; Ensure that the backend selects SSE/AVX scalar fp instructions
-; from a packed fp instrution plus a vector insert.
-
-
-define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fadd <4 x float> %a, %b
-  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test_add_ss
-; SSE2: addss   %xmm1, %xmm0
-; AVX: vaddss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fsub <4 x float> %a, %b
-  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test_sub_ss
-; SSE2: subss   %xmm1, %xmm0
-; AVX: vsubss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fmul <4 x float> %a, %b
-  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test_mul_ss
-; SSE2: mulss   %xmm1, %xmm0
-; AVX: vmulss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fdiv <4 x float> %a, %b
-  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test_div_ss
-; SSE2: divss   %xmm1, %xmm0
-; AVX: vdivss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fadd <2 x double> %a, %b
-  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test_add_sd
-; SSE2: addsd   %xmm1, %xmm0
-; AVX: vaddsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fsub <2 x double> %a, %b
-  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test_sub_sd
-; SSE2: subsd   %xmm1, %xmm0
-; AVX: vsubsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fmul <2 x double> %a, %b
-  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test_mul_sd
-; SSE2: mulsd   %xmm1, %xmm0
-; AVX: vmulsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fdiv <2 x double> %a, %b
-  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test_div_sd
-; SSE2: divsd   %xmm1, %xmm0
-; AVX: vdivsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fadd <4 x float> %b, %a
-  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test2_add_ss
-; SSE2: addss   %xmm0, %xmm1
-; AVX: vaddss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fsub <4 x float> %b, %a
-  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test2_sub_ss
-; SSE2: subss   %xmm0, %xmm1
-; AVX: vsubss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fmul <4 x float> %b, %a
-  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test2_mul_ss
-; SSE2: mulss   %xmm0, %xmm1
-; AVX: vmulss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fdiv <4 x float> %b, %a
-  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test2_div_ss
-; SSE2: divss   %xmm0, %xmm1
-; AVX: vdivss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fadd <2 x double> %b, %a
-  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test2_add_sd
-; SSE2: addsd   %xmm0, %xmm1
-; AVX: vaddsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fsub <2 x double> %b, %a
-  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test2_sub_sd
-; SSE2: subsd   %xmm0, %xmm1
-; AVX: vsubsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fmul <2 x double> %b, %a
-  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test2_mul_sd
-; SSE2: mulsd   %xmm0, %xmm1
-; AVX: vmulsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fdiv <2 x double> %b, %a
-  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test2_div_sd
-; SSE2: divsd   %xmm0, %xmm1
-; AVX: vdivsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <4 x float> @test3_add_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fadd <4 x float> %a, %b
-  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test3_add_ss
-; SSE2: addss   %xmm1, %xmm0
-; AVX: vaddss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test3_sub_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fsub <4 x float> %a, %b
-  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test3_sub_ss
-; SSE2: subss   %xmm1, %xmm0
-; AVX: vsubss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test3_mul_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fmul <4 x float> %a, %b
-  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test3_mul_ss
-; SSE2: mulss   %xmm1, %xmm0
-; AVX: vmulss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test3_div_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fdiv <4 x float> %a, %b
-  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test3_div_ss
-; SSE2: divss   %xmm1, %xmm0
-; AVX: vdivss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <2 x double> @test3_add_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fadd <2 x double> %a, %b
-  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test3_add_sd
-; SSE2: addsd   %xmm1, %xmm0
-; AVX: vaddsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test3_sub_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fsub <2 x double> %a, %b
-  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test3_sub_sd
-; SSE2: subsd   %xmm1, %xmm0
-; AVX: vsubsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test3_mul_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fmul <2 x double> %a, %b
-  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test3_mul_sd
-; SSE2: mulsd   %xmm1, %xmm0
-; AVX: vmulsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test3_div_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fdiv <2 x double> %a, %b
-  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test3_div_sd
-; SSE2: divsd   %xmm1, %xmm0
-; AVX: vdivsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <4 x float> @test4_add_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fadd <4 x float> %b, %a
-  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test4_add_ss
-; SSE2: addss   %xmm0, %xmm1
-; AVX: vaddss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test4_sub_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fsub <4 x float> %b, %a
-  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test4_sub_ss
-; SSE2: subss   %xmm0, %xmm1
-; AVX: vsubss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test4_mul_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fmul <4 x float> %b, %a
-  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test4_mul_ss
-; SSE2: mulss   %xmm0, %xmm1
-; AVX: vmulss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test4_div_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fdiv <4 x float> %b, %a
-  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test4_div_ss
-; SSE2: divss   %xmm0, %xmm1
-; AVX: vdivss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <2 x double> @test4_add_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fadd <2 x double> %b, %a
-  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test4_add_sd
-; SSE2: addsd   %xmm0, %xmm1
-; AVX: vaddsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test4_sub_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fsub <2 x double> %b, %a
-  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test4_sub_sd
-; SSE2: subsd   %xmm0, %xmm1
-; AVX: vsubsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test4_mul_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fmul <2 x double> %b, %a
-  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test4_mul_sd
-; SSE2: mulsd   %xmm0, %xmm1
-; AVX: vmulsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test4_div_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fdiv <2 x double> %b, %a
-  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test4_div_sd
-; SSE2: divsd   %xmm0, %xmm1
-; AVX: vdivsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
diff --git a/test/CodeGen/X86/sse-scalar-fp-arith.ll b/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 3949a83..b122ef6 100644
--- a/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1,13 +1,23 @@
-; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
-; RUN: llc -mtriple=x86_64-pc-linux -mattr=-sse4.1 -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
-; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s
+; RUN: llc -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s
+; RUN: llc -mcpu=x86-64 -mattr=+sse4.1 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE41 %s
+; RUN: llc -mcpu=x86-64 -mattr=+avx < %s | FileCheck --check-prefix=AVX %s
+
+target triple = "x86_64-unknown-unknown"
 
 ; Ensure that the backend no longer emits unnecessary vector insert
 ; instructions immediately after SSE scalar fp instructions
 ; like addss or mulss.
 
-
 define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_add_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_add_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %b, i32 0
   %2 = extractelement <4 x float> %a, i32 0
   %add = fadd float %2, %1
@@ -15,14 +25,16 @@ define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test_add_ss
-; SSE2: addss   %xmm1, %xmm0
-; AVX: vaddss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_sub_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_sub_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %b, i32 0
   %2 = extractelement <4 x float> %a, i32 0
   %sub = fsub float %2, %1
@@ -30,13 +42,16 @@ define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test_sub_ss
-; SSE2: subss   %xmm1, %xmm0
-; AVX: vsubss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
 define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_mul_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_mul_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %b, i32 0
   %2 = extractelement <4 x float> %a, i32 0
   %mul = fmul float %2, %1
@@ -44,14 +59,16 @@ define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test_mul_ss
-; SSE2: mulss   %xmm1, %xmm0
-; AVX: vmulss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_div_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_div_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %b, i32 0
   %2 = extractelement <4 x float> %a, i32 0
   %div = fdiv float %2, %1
@@ -59,14 +76,16 @@ define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test_div_ss
-; SSE2: divss   %xmm1, %xmm0
-; AVX: vdivss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test_add_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_add_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <2 x double> %b, i32 0
   %2 = extractelement <2 x double> %a, i32 0
   %add = fadd double %2, %1
@@ -74,14 +93,16 @@ define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
   ret <2 x double> %3
 }
 
-; CHECK-LABEL: test_add_sd
-; SSE2: addsd   %xmm1, %xmm0
-; AVX: vaddsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
 define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test_sub_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_sub_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <2 x double> %b, i32 0
   %2 = extractelement <2 x double> %a, i32 0
   %sub = fsub double %2, %1
@@ -89,14 +110,16 @@ define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
   ret <2 x double> %3
 }
 
-; CHECK-LABEL: test_sub_sd
-; SSE2: subsd   %xmm1, %xmm0
-; AVX: vsubsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
 define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test_mul_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_mul_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <2 x double> %b, i32 0
   %2 = extractelement <2 x double> %a, i32 0
   %mul = fmul double %2, %1
@@ -104,14 +127,16 @@ define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
   ret <2 x double> %3
 }
 
-; CHECK-LABEL: test_mul_sd
-; SSE2: mulsd   %xmm1, %xmm0
-; AVX: vmulsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
 define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test_div_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_div_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <2 x double> %b, i32 0
   %2 = extractelement <2 x double> %a, i32 0
   %div = fdiv double %2, %1
@@ -119,14 +144,17 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
   ret <2 x double> %3
 }
 
-; CHECK-LABEL: test_div_sd
-; SSE2: divsd   %xmm1, %xmm0
-; AVX: vdivsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
 define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test2_add_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2_add_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %a, i32 0
   %2 = extractelement <4 x float> %b, i32 0
   %add = fadd float %1, %2
@@ -134,14 +162,17 @@ define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test2_add_ss
-; SSE2: addss   %xmm0, %xmm1
-; AVX: vaddss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test2_sub_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2_sub_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %a, i32 0
   %2 = extractelement <4 x float> %b, i32 0
   %sub = fsub float %2, %1
@@ -149,14 +180,17 @@ define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test2_sub_ss
-; SSE2: subss   %xmm0, %xmm1
-; AVX: vsubss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test2_mul_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2_mul_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %a, i32 0
   %2 = extractelement <4 x float> %b, i32 0
   %mul = fmul float %1, %2
@@ -164,14 +198,17 @@ define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test2_mul_ss
-; SSE2: mulss   %xmm0, %xmm1
-; AVX: vmulss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test2_div_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2_div_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %a, i32 0
   %2 = extractelement <4 x float> %b, i32 0
   %div = fdiv float %2, %1
@@ -179,14 +216,17 @@ define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test2_div_ss
-; SSE2: divss   %xmm0, %xmm1
-; AVX: vdivss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test2_add_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2_add_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <2 x double> %a, i32 0
   %2 = extractelement <2 x double> %b, i32 0
   %add = fadd double %1, %2
@@ -194,14 +234,17 @@ define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
   ret <2 x double> %3
 }
 
-; CHECK-LABEL: test2_add_sd
-; SSE2: addsd   %xmm0, %xmm1
-; AVX: vaddsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
 define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test2_sub_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2_sub_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <2 x double> %a, i32 0
   %2 = extractelement <2 x double> %b, i32 0
   %sub = fsub double %2, %1
@@ -209,14 +252,17 @@ define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
   ret <2 x double> %3
 }
 
-; CHECK-LABEL: test2_sub_sd
-; SSE2: subsd   %xmm0, %xmm1
-; AVX: vsubsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
 define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test2_mul_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2_mul_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <2 x double> %a, i32 0
   %2 = extractelement <2 x double> %b, i32 0
   %mul = fmul double %1, %2
@@ -224,14 +270,17 @@ define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
   ret <2 x double> %3
 }
 
-; CHECK-LABEL: test2_mul_sd
-; SSE2: mulsd   %xmm0, %xmm1
-; AVX: vmulsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
 define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test2_div_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2_div_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <2 x double> %a, i32 0
   %2 = extractelement <2 x double> %b, i32 0
   %div = fdiv double %2, %1
@@ -239,14 +288,18 @@ define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
   ret <2 x double> %3
 }
 
-; CHECK-LABEL: test2_div_sd
-; SSE2: divsd   %xmm0, %xmm1
-; AVX: vdivsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
 define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_multiple_add_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm0, %xmm1
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_multiple_add_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %b, i32 0
   %2 = extractelement <4 x float> %a, i32 0
   %add = fadd float %2, %1
@@ -255,14 +308,19 @@ define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test_multiple_add_ss
-; CHECK: addss
-; CHECK: addss
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_multiple_sub_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    subss %xmm1, %xmm2
+; SSE-NEXT:    subss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_multiple_sub_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %b, i32 0
   %2 = extractelement <4 x float> %a, i32 0
   %sub = fsub float %2, %1
@@ -271,14 +329,18 @@ define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test_multiple_sub_ss
-; CHECK: subss
-; CHECK: subss
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_multiple_mul_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss %xmm0, %xmm1
+; SSE-NEXT:    mulss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_multiple_mul_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %b, i32 0
   %2 = extractelement <4 x float> %a, i32 0
   %mul = fmul float %2, %1
@@ -287,13 +349,19 @@ define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test_multiple_mul_ss
-; CHECK: mulss
-; CHECK: mulss
-; CHECK-NOT: movss
-; CHECK: ret
-
 define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_multiple_div_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    divss %xmm1, %xmm2
+; SSE-NEXT:    divss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_multiple_div_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %b, i32 0
   %2 = extractelement <4 x float> %a, i32 0
   %div = fdiv float %2, %1
@@ -302,9 +370,501 @@ define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test_multiple_div_ss
-; CHECK: divss
-; CHECK: divss
-; CHECK-NOT: movss
-; CHECK: ret
+; Ensure that the backend selects SSE/AVX scalar fp instructions
+; from a packed fp instrution plus a vector insert.
+
+define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test_add_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test_add_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fadd <4 x float> %a, %b
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test_sub_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test_sub_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fsub <4 x float> %a, %b
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test_mul_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test_mul_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fmul <4 x float> %a, %b
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test_div_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test_div_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fdiv <4 x float> %a, %b
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test_add_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test_add_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fadd <2 x double> %a, %b
+  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test_sub_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test_sub_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fsub <2 x double> %a, %b
+  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test_mul_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test_mul_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fmul <2 x double> %a, %b
+  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test_div_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test_div_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fdiv <2 x double> %a, %b
+  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test2_add_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test2_add_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fadd <4 x float> %b, %a
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test2_sub_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test2_sub_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fsub <4 x float> %b, %a
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test2_mul_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test2_mul_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fmul <4 x float> %b, %a
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test2_div_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test2_div_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fdiv <4 x float> %b, %a
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test2_add_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test2_add_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fadd <2 x double> %b, %a
+  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test2_sub_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test2_sub_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fsub <2 x double> %b, %a
+  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test2_mul_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test2_mul_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fmul <2 x double> %b, %a
+  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test2_div_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test2_div_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fdiv <2 x double> %b, %a
+  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test3_add_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test3_add_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fadd <4 x float> %a, %b
+  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test3_sub_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test3_sub_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fsub <4 x float> %a, %b
+  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test3_mul_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test3_mul_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fmul <4 x float> %a, %b
+  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+  ret <4 x float> %2
+}
 
+define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test3_div_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test3_div_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fdiv <4 x float> %a, %b
+  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+  ret <4 x float> %2
+}
+
+define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test3_add_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test3_add_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fadd <2 x double> %a, %b
+  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test3_sub_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test3_sub_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fsub <2 x double> %a, %b
+  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test3_mul_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test3_mul_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fmul <2 x double> %a, %b
+  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test3_div_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test3_div_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fdiv <2 x double> %a, %b
+  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+  ret <2 x double> %2
+}
+
+define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test4_add_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test4_add_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fadd <4 x float> %b, %a
+  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test4_sub_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test4_sub_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fsub <4 x float> %b, %a
+  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test4_mul_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test4_mul_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fmul <4 x float> %b, %a
+  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test4_div_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test4_div_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fdiv <4 x float> %b, %a
+  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+  ret <4 x float> %2
+}
+
+define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test4_add_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test4_add_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fadd <2 x double> %b, %a
+  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test4_sub_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test4_sub_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fsub <2 x double> %b, %a
+  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test4_mul_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test4_mul_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fmul <2 x double> %b, %a
+  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test4_div_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test4_div_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fdiv <2 x double> %b, %a
+  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+  ret <2 x double> %2
+}
diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll
index 183297e..fd35e75 100644
--- a/test/CodeGen/X86/sse1.ll
+++ b/test/CodeGen/X86/sse1.ll
@@ -1,17 +1,6 @@
 ; Tests for SSE1 and below, without SSE2+.
-; RUN: llc < %s -march=x86 -mcpu=pentium3 -O3 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mattr=-sse2,+sse -O3 | FileCheck %s
-
-define <8 x i16> @test1(<8 x i32> %a) nounwind {
-; CHECK: test1
-  ret <8 x i16> zeroinitializer
-}
-
-define <8 x i16> @test2(<8 x i32> %a) nounwind {
-; CHECK: test2
-  %c = trunc <8 x i32> %a to <8 x i16>            ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %c
-}
+; RUN: llc < %s -mtriple=i386-unknown-unknown -march=x86 -mcpu=pentium3 -O3 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=-sse2,+sse -O3 | FileCheck %s
 
 ; PR7993
 ;define <4 x i32> @test3(<4 x i16> %a) nounwind {
@@ -23,6 +12,15 @@ define <8 x i16> @test2(<8 x i32> %a) nounwind {
 ; vector that this ends up returning.
 ; rdar://8368414
 define <2 x float> @test4(<2 x float> %A, <2 x float> %B) nounwind {
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movaps %xmm0, %xmm2
+; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; CHECK-NEXT:    addss %xmm1, %xmm0
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-NEXT:    subss %xmm1, %xmm2
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-NEXT:    ret
 entry:
   %tmp7 = extractelement <2 x float> %A, i32 0
   %tmp5 = extractelement <2 x float> %A, i32 1
@@ -33,15 +31,6 @@ entry:
   %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
   %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
   ret <2 x float> %tmp9
-; CHECK-LABEL: test4:
-; CHECK-NOT: shufps	$16
-; CHECK: shufps	$1, 
-; CHECK-NOT: shufps	$16
-; CHECK: shufps	$1, 
-; CHECK-NOT: shufps	$16
-; CHECK: unpcklps
-; CHECK-NOT: shufps	$16
-; CHECK: ret
 }
 
 ; We used to get stuck in type legalization for this example when lowering the
@@ -50,8 +39,9 @@ entry:
 ; condition operand and widening the resulting vselect for the v4f32 result.
 ; PR18036
 
-; CHECK-LABEL: vselect
 define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
+; CHECK-LABEL: vselect:
+; CHECK:         ret
 entry:
   %a1 = icmp eq <4 x i32> %q, zeroinitializer
   %a14 = select <4 x i1> %a1, <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+0> , <4 x float> zeroinitializer
diff --git a/test/CodeGen/X86/sse2-blend.ll b/test/CodeGen/X86/sse2-blend.ll
deleted file mode 100644
index c63ff72..0000000
--- a/test/CodeGen/X86/sse2-blend.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s
-
-; CHECK-LABEL: vsel_float
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NOT: orps
-; CHECK: ret
-define void@vsel_float(<4 x float>* %v1, <4 x float>* %v2) {
-  %A = load <4 x float>* %v1
-  %B = load <4 x float>* %v2
-  %vsel = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %A, <4 x float> %B
-  store <4 x float > %vsel, <4 x float>* %v1
-  ret void
-}
-
-; CHECK-LABEL: vsel_i32
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NOT: orps
-; CHECK: ret
-define void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) {
-  %A = load <4 x i32>* %v1
-  %B = load <4 x i32>* %v2
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %A, <4 x i32> %B
-  store <4 x i32 > %vsel, <4 x i32>* %v1
-  ret void
-}
-
-; Without forcing instructions, fall back to the preferred PS domain.
-; CHECK-LABEL: vsel_i64
-; CHECK: andnps
-; CHECK: orps
-; CHECK: ret
-
-define void@vsel_i64(<2 x i64>* %v1, <2 x i64>* %v2) {
-  %A = load <2 x i64>* %v1
-  %B = load <2 x i64>* %v2
-  %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %A, <2 x i64> %B
-  store <2 x i64 > %vsel, <2 x i64>* %v1
-  ret void
-}
-
-; Without forcing instructions, fall back to the preferred PS domain.
-; CHECK-LABEL: vsel_double
-; CHECK: andnps
-; CHECK: orps
-; CHECK: ret
-
-define void@vsel_double(<2 x double>* %v1, <2 x double>* %v2) {
-  %A = load <2 x double>* %v1
-  %B = load <2 x double>* %v2
-  %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %A, <2 x double> %B
-  store <2 x double > %vsel, <2 x double>* %v1
-  ret void
-}
-
-
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index c906ecd..c4d9e6d 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -408,21 +408,21 @@ define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
-  ; CHECK: pslldq
-  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
-  ; CHECK: pslldq
-  %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
+
+
+define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
+  ; CHECK: pslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
+  ; CHECK: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
 declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone
 
 
@@ -504,21 +504,21 @@ define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
-  ; CHECK: psrldq
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
-  ; CHECK: psrldq
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
+  ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
+  ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
 declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone
 
 
diff --git a/test/CodeGen/X86/sse2-mul.ll b/test/CodeGen/X86/sse2-mul.ll
deleted file mode 100644
index e066368..0000000
--- a/test/CodeGen/X86/sse2-mul.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s
-
-define <4 x i32> @test1(<4 x i32> %x, <4 x i32> %y) {
-  %m = mul <4 x i32> %x, %y
-  ret <4 x i32> %m
-; CHECK-LABEL: test1:
-; CHECK: pshufd $49
-; CHECK: pmuludq
-; CHECK: pshufd $49
-; CHECK: pmuludq
-; CHECK: shufps $-120
-; CHECK: pshufd $-40
-; CHECK: ret
-}
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index e8d3d6f..b7db6cb 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -2,39 +2,48 @@
 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s
 
 define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind  {
+; CHECK-LABEL: test1:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movapd (%ecx), %xmm0
+; CHECK-NEXT:    movlpd {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT:    movapd %xmm0, (%eax)
+; CHECK-NEXT:    retl
 	%tmp3 = load <2 x double>* %A, align 16
 	%tmp7 = insertelement <2 x double> undef, double %B, i32 0
 	%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
 	store <2 x double> %tmp9, <2 x double>* %r, align 16
 	ret void
-
-; CHECK-LABEL: test1:
-; CHECK: 	movl	4(%esp), %eax
-; CHECK-NEXT: 	movl	8(%esp), %ecx
-; CHECK-NEXT: 	movapd	(%ecx), %xmm0
-; CHECK-NEXT: 	movlpd	12(%esp), %xmm0
-; CHECK-NEXT: 	movapd	%xmm0, (%eax)
-; CHECK-NEXT: 	ret
 }
 
 define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind  {
+; CHECK-LABEL: test2:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movapd (%ecx), %xmm0
+; CHECK-NEXT:    movhpd {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT:    movapd %xmm0, (%eax)
+; CHECK-NEXT:    retl
 	%tmp3 = load <2 x double>* %A, align 16
 	%tmp7 = insertelement <2 x double> undef, double %B, i32 0
 	%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
 	store <2 x double> %tmp9, <2 x double>* %r, align 16
 	ret void
-
-; CHECK-LABEL: test2:
-; CHECK: 	movl	4(%esp), %eax
-; CHECK: 	movl	8(%esp), %ecx
-; CHECK-NEXT: 	movapd	(%ecx), %xmm0
-; CHECK-NEXT: 	movhpd	12(%esp), %xmm0
-; CHECK-NEXT: 	movapd	%xmm0, (%eax)
-; CHECK-NEXT: 	ret
 }
 
 
 define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
+; CHECK-LABEL: test3:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movaps (%edx), %xmm0
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    movaps %xmm0, (%eax)
+; CHECK-NEXT:    retl
 	%tmp = load <4 x float>* %B		; <<4 x float>> [#uses=2]
 	%tmp3 = load <4 x float>* %A		; <<4 x float>> [#uses=2]
 	%tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0		; <float> [#uses=1]
@@ -47,24 +56,30 @@ define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind
 	%tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3		; <<4 x float>> [#uses=1]
 	store <4 x float> %tmp13, <4 x float>* %res
 	ret void
-; CHECK: @test3
-; CHECK: 	unpcklps
 }
 
 define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
+; CHECK-LABEL: test4:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; CHECK-NEXT:    movaps %xmm0, (%eax)
+; CHECK-NEXT:    retl
 	%tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
 	store <4 x float> %tmp5, <4 x float>* %res
 	ret void
-; CHECK: @test4
-; CHECK: 	pshufd	$50, %xmm0, %xmm0
 }
 
 define <4 x i32> @test5(i8** %ptr) nounwind {
 ; CHECK-LABEL: test5:
-; CHECK: pxor
-; CHECK: punpcklbw
-; CHECK: punpcklwd
-
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    movss (%eax), %xmm1
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    retl
 	%tmp = load i8** %ptr		; <i8*> [#uses=1]
 	%tmp.upgrd.1 = bitcast i8* %tmp to float*		; <float*> [#uses=1]
 	%tmp.upgrd.2 = load float* %tmp.upgrd.1		; <float> [#uses=1]
@@ -81,30 +96,39 @@ define <4 x i32> @test5(i8** %ptr) nounwind {
 }
 
 define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
-        %tmp1 = load <4 x float>* %A            ; <<4 x float>> [#uses=1]
-        %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 >          ; <<4 x float>> [#uses=1]
-        store <4 x float> %tmp2, <4 x float>* %res
-        ret void
-
 ; CHECK-LABEL: test6:
-; CHECK: 	movaps	(%ecx), %xmm0
-; CHECK:	movaps	%xmm0, (%eax)
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movaps (%ecx), %xmm0
+; CHECK-NEXT:    movaps %xmm0, (%eax)
+; CHECK-NEXT:    retl
+  %tmp1 = load <4 x float>* %A            ; <<4 x float>> [#uses=1]
+  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 >          ; <<4 x float>> [#uses=1]
+  store <4 x float> %tmp2, <4 x float>* %res
+  ret void
 }
 
 define void @test7() nounwind {
-        bitcast <4 x i32> zeroinitializer to <4 x float>                ; <<4 x float>>:1 [#uses=1]
-        shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer         ; <<4 x float>>:2 [#uses=1]
-        store <4 x float> %2, <4 x float>* null
-        ret void
-
 ; CHECK-LABEL: test7:
-; CHECK:	xorps	%xmm0, %xmm0
-; CHECK:	movaps	%xmm0, 0
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    movaps %xmm0, 0
+; CHECK-NEXT:    retl
+  bitcast <4 x i32> zeroinitializer to <4 x float>                ; <<4 x float>>:1 [#uses=1]
+  shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer         ; <<4 x float>>:2 [#uses=1]
+  store <4 x float> %2, <4 x float>* null
+  ret void
 }
 
 @x = external global [4 x i32]
 
 define <2 x i64> @test8() nounwind {
+; CHECK-LABEL: test8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl L_x$non_lazy_ptr, %eax
+; CHECK-NEXT:    movups (%eax), %xmm0
+; CHECK-NEXT:    retl
 	%tmp = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 0)		; <i32> [#uses=1]
 	%tmp3 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 1)		; <i32> [#uses=1]
 	%tmp5 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 2)		; <i32> [#uses=1]
@@ -115,90 +139,123 @@ define <2 x i64> @test8() nounwind {
 	%tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3		; <<4 x i32>> [#uses=1]
 	%tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64>		; <<2 x i64>> [#uses=1]
 	ret <2 x i64> %tmp16
-; CHECK-LABEL: test8:
-; CHECK: movups	(%eax), %xmm0
 }
 
 define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
+; CHECK-LABEL: test9:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT:    retl
 	%tmp = insertelement <4 x float> undef, float %a, i32 0		; <<4 x float>> [#uses=1]
 	%tmp11 = insertelement <4 x float> %tmp, float %b, i32 1		; <<4 x float>> [#uses=1]
 	%tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2		; <<4 x float>> [#uses=1]
 	%tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3		; <<4 x float>> [#uses=1]
 	ret <4 x float> %tmp13
-; CHECK-LABEL: test9:
-; CHECK: movups	8(%esp), %xmm0
 }
 
 define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
+; CHECK-LABEL: test10:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT:    retl
 	%tmp = insertelement <4 x float> undef, float %a, i32 0		; <<4 x float>> [#uses=1]
 	%tmp11 = insertelement <4 x float> %tmp, float %b, i32 1		; <<4 x float>> [#uses=1]
 	%tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2		; <<4 x float>> [#uses=1]
 	%tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3		; <<4 x float>> [#uses=1]
 	ret <4 x float> %tmp13
-; CHECK-LABEL: test10:
-; CHECK: movaps	4(%esp), %xmm0
 }
 
 define <2 x double> @test11(double %a, double %b) nounwind {
+; CHECK-LABEL: test11:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT:    retl
 	%tmp = insertelement <2 x double> undef, double %a, i32 0		; <<2 x double>> [#uses=1]
 	%tmp7 = insertelement <2 x double> %tmp, double %b, i32 1		; <<2 x double>> [#uses=1]
 	ret <2 x double> %tmp7
-; CHECK-LABEL: test11:
-; CHECK: movaps	4(%esp), %xmm0
 }
 
 define void @test12() nounwind {
-        %tmp1 = load <4 x float>* null          ; <<4 x float>> [#uses=2]
-        %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >             ; <<4 x float>> [#uses=1]
-        %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >                ; <<4 x float>> [#uses=1]
-        %tmp4 = fadd <4 x float> %tmp2, %tmp3            ; <<4 x float>> [#uses=1]
-        store <4 x float> %tmp4, <4 x float>* null
-        ret void
 ; CHECK-LABEL: test12:
-; CHECK: movhlps
-; CHECK: shufps
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movapd 0, %xmm0
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; CHECK-NEXT:    movsd %xmm0, %xmm1
+; CHECK-NEXT:    xorpd %xmm2, %xmm2
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; CHECK-NEXT:    addps %xmm1, %xmm0
+; CHECK-NEXT:    movaps %xmm0, 0
+; CHECK-NEXT:    retl
+  %tmp1 = load <4 x float>* null          ; <<4 x float>> [#uses=2]
+  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >             ; <<4 x float>> [#uses=1]
+  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >                ; <<4 x float>> [#uses=1]
+  %tmp4 = fadd <4 x float> %tmp2, %tmp3            ; <<4 x float>> [#uses=1]
+  store <4 x float> %tmp4, <4 x float>* null
+  ret void
 }
 
 define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
-        %tmp3 = load <4 x float>* %B            ; <<4 x float>> [#uses=1]
-        %tmp5 = load <4 x float>* %C            ; <<4 x float>> [#uses=1]
-        %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 >         ; <<4 x float>> [#uses=1]
-        store <4 x float> %tmp11, <4 x float>* %res
-        ret void
-; CHECK: test13
-; CHECK: shufps	$69, (%ecx), %xmm0
-; CHECK: pshufd	$-40, %xmm0, %xmm0
+; CHECK-LABEL: test13:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movaps (%edx), %xmm0
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; CHECK-NEXT:    movaps %xmm0, (%eax)
+; CHECK-NEXT:    retl
+  %tmp3 = load <4 x float>* %B            ; <<4 x float>> [#uses=1]
+  %tmp5 = load <4 x float>* %C            ; <<4 x float>> [#uses=1]
+  %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 >         ; <<4 x float>> [#uses=1]
+  store <4 x float> %tmp11, <4 x float>* %res
+  ret void
 }
 
 define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
-        %tmp = load <4 x float>* %y             ; <<4 x float>> [#uses=2]
-        %tmp5 = load <4 x float>* %x            ; <<4 x float>> [#uses=2]
-        %tmp9 = fadd <4 x float> %tmp5, %tmp             ; <<4 x float>> [#uses=1]
-        %tmp21 = fsub <4 x float> %tmp5, %tmp            ; <<4 x float>> [#uses=1]
-        %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 >                ; <<4 x float>> [#uses=1]
-        ret <4 x float> %tmp27
 ; CHECK-LABEL: test14:
-; CHECK: 	addps	[[X1:%xmm[0-9]+]], [[X0:%xmm[0-9]+]]
-; CHECK: 	subps	[[X1]], [[X2:%xmm[0-9]+]]
-; CHECK: 	movlhps	[[X2]], [[X0]]
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movaps (%ecx), %xmm1
+; CHECK-NEXT:    movaps (%eax), %xmm2
+; CHECK-NEXT:    movaps %xmm2, %xmm0
+; CHECK-NEXT:    addps %xmm1, %xmm0
+; CHECK-NEXT:    subps %xmm1, %xmm2
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT:    retl
+  %tmp = load <4 x float>* %y             ; <<4 x float>> [#uses=2]
+  %tmp5 = load <4 x float>* %x            ; <<4 x float>> [#uses=2]
+  %tmp9 = fadd <4 x float> %tmp5, %tmp             ; <<4 x float>> [#uses=1]
+  %tmp21 = fsub <4 x float> %tmp5, %tmp            ; <<4 x float>> [#uses=1]
+  %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 >                ; <<4 x float>> [#uses=1]
+  ret <4 x float> %tmp27
 }
 
 define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
-entry:
-        %tmp = load <4 x float>* %y             ; <<4 x float>> [#uses=1]
-        %tmp3 = load <4 x float>* %x            ; <<4 x float>> [#uses=1]
-        %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >           ; <<4 x float>> [#uses=1]
-        ret <4 x float> %tmp4
 ; CHECK-LABEL: test15:
-; CHECK: 	movhlps	%xmm1, %xmm0
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movapd (%ecx), %xmm0
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-NEXT:    retl
+entry:
+  %tmp = load <4 x float>* %y             ; <<4 x float>> [#uses=1]
+  %tmp3 = load <4 x float>* %x            ; <<4 x float>> [#uses=1]
+  %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >           ; <<4 x float>> [#uses=1]
+  ret <4 x float> %tmp4
 }
 
 ; PR8900
-; CHECK-LABEL: test16:
-; CHECK: unpcklpd
-; CHECK: ret
 
 define  <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
+; CHECK-LABEL: test16:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movapd 96(%eax), %xmm0
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    retl
   %i5 = getelementptr inbounds <4 x double>* %srcA, i32 3
   %i6 = load <4 x double>* %i5, align 32
   %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
@@ -207,6 +264,11 @@ define  <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocap
 
 ; PR9009
 define fastcc void @test17() nounwind {
+; CHECK-LABEL: test17:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = <u,u,32768,32768>
+; CHECK-NEXT:    movaps %xmm0, (%eax)
+; CHECK-NEXT:    retl
 entry:
   %0 = insertelement <4 x i32> undef, i32 undef, i32 1
   %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
@@ -217,25 +279,48 @@ entry:
 
 ; PR9210
 define <4 x float> @f(<4 x double>) nounwind {
+; CHECK-LABEL: f:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    cvtpd2ps %xmm1, %xmm1
+; CHECK-NEXT:    cvtpd2ps %xmm0, %xmm0
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retl
 entry:
  %double2float.i = fptrunc <4 x double> %0 to <4 x float>
  ret <4 x float> %double2float.i
 }
 
 define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
-; CHECK-LABEL: test_insert_64_zext
-; CHECK-NOT: xor
-; CHECK: movq
+; CHECK-LABEL: test_insert_64_zext:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movq %xmm0, %xmm0
+; CHECK-NEXT:    retl
   %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
   ret <2 x i64> %1
 }
 
 define <4 x i32> @PR19721(<4 x i32> %i) {
+; CHECK-LABEL: PR19721:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    movss %xmm1, %xmm0
+; CHECK-NEXT:    retl
   %bc = bitcast <4 x i32> %i to i128
   %insert = and i128 %bc, -4294967296
   %bc2 = bitcast i128 %insert to <4 x i32>
   ret <4 x i32> %bc2
+}
 
-; CHECK-LABEL: PR19721
-; CHECK: punpckldq
+define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_mul:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; CHECK-NEXT:    retl
+  %m = mul <4 x i32> %x, %y
+  ret <4 x i32> %m
 }
diff --git a/test/CodeGen/X86/sse3-avx-addsub-2.ll b/test/CodeGen/X86/sse3-avx-addsub-2.ll
index b7706cc..5b2de28 100644
--- a/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
 ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
 
 
diff --git a/test/CodeGen/X86/sse3-avx-addsub.ll b/test/CodeGen/X86/sse3-avx-addsub.ll
index 8b66743..431588f 100644
--- a/test/CodeGen/X86/sse3-avx-addsub.ll
+++ b/test/CodeGen/X86/sse3-avx-addsub.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s -check-prefix=SSE -check-prefix=CHECK
+; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s -check-prefix=SSE -check-prefix=CHECK
 ; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX -check-prefix=CHECK
 
 ; Test ADDSUB ISel patterns.
@@ -141,156 +141,3 @@ define <2 x double> @test4b(<2 x double> %A, <2 x double>* %B) {
 ; AVX: vaddsubpd
 ; CHECK-NEXT: ret
 
-; Functions below are obtained from the following source:
-;
-; float4 test1(float4 A, float4 B) {
-;   float4 X = A + B;
-;   float4 Y = A - B;
-;   return (float4){X[0], Y[1], X[2], Y[3]};
-; }
-;
-; float8 test2(float8 A, float8 B) {
-;   float8 X = A + B;
-;   float8 Y = A - B;
-;   return (float8){X[0], Y[1], X[2], Y[3], X[4], Y[5], X[6], Y[7]};
-; }
-;
-; double4 test3(double4 A, double4 B) {
-;   double4 X = A + B;
-;   double4 Y = A - B;
-;   return (double4){X[0], Y[1], X[2], Y[3]};
-; }
-;
-; double2 test4(double2 A, double2 B) {
-;   double2 X = A + B;
-;   double2 Y = A - B;
-;   return (double2){X[0], Y[1]};
-; }
-
-define <4 x float> @test5(<4 x float> %A, <4 x float> %B) {
-  %sub = fsub <4 x float> %A, %B
-  %add = fadd <4 x float> %A, %B
-  %vecinit6 = shufflevector <4 x float> %add, <4 x float> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x float> %vecinit6
-}
-; CHECK-LABEL: test5
-; SSE: xorps
-; SSE-NEXT: addsubps
-; AVX: vxorps
-; AVX-NEXT: vaddsubps
-; CHECK: ret
-
-
-define <8 x float> @test6(<8 x float> %A, <8 x float> %B) {
-  %sub = fsub <8 x float> %A, %B
-  %add = fadd <8 x float> %A, %B
-  %vecinit14 = shufflevector <8 x float> %add, <8 x float> %sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
-  ret <8 x float> %vecinit14
-}
-; CHECK-LABEL: test6
-; SSE: xorps
-; SSE-NEXT: addsubps
-; SSE: xorps
-; SSE-NEXT: addsubps
-; AVX: vxorps
-; AVX-NEXT: vaddsubps
-; AVX-NOT: vxorps
-; AVX-NOT: vaddsubps
-; CHECK: ret
-
-
-define <4 x double> @test7(<4 x double> %A, <4 x double> %B) {
-  %sub = fsub <4 x double> %A, %B
-  %add = fadd <4 x double> %A, %B
-  %vecinit6 = shufflevector <4 x double> %add, <4 x double> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x double> %vecinit6
-}
-; CHECK-LABEL: test7
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; AVX: vxorpd
-; AVX-NEXT: vaddsubpd
-; AVX-NOT: vxorpd
-; AVX-NOT: vaddsubpd
-; CHECK: ret
-
-
-define <2 x double> @test8(<2 x double> %A, <2 x double> %B) #0 {
-  %add = fadd <2 x double> %A, %B
-  %sub = fsub <2 x double> %A, %B
-  %vecinit2 = shufflevector <2 x double> %add, <2 x double> %sub, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %vecinit2
-}
-; CHECK-LABEL: test8
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; AVX: vxorpd
-; AVX-NEXT: vaddsubpd
-; CHECK: ret
-
-
-define <4 x float> @test5b(<4 x float> %A, <4 x float> %B) {
-  %sub = fsub <4 x float> %A, %B
-  %add = fadd <4 x float> %B, %A
-  %vecinit6 = shufflevector <4 x float> %add, <4 x float> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x float> %vecinit6
-}
-; CHECK-LABEL: test5
-; SSE: xorps
-; SSE-NEXT: addsubps
-; AVX: vxorps
-; AVX-NEXT: vaddsubps
-; CHECK: ret
-
-
-define <8 x float> @test6b(<8 x float> %A, <8 x float> %B) {
-  %sub = fsub <8 x float> %A, %B
-  %add = fadd <8 x float> %B, %A
-  %vecinit14 = shufflevector <8 x float> %add, <8 x float> %sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
-  ret <8 x float> %vecinit14
-}
-; CHECK-LABEL: test6
-; SSE: xorps
-; SSE-NEXT: addsubps
-; SSE: xorps
-; SSE-NEXT: addsubps
-; AVX: vxorps
-; AVX-NEXT: vaddsubps
-; AVX-NOT: vxorps
-; AVX-NOT: vaddsubps
-; CHECK: ret
-
-
-define <4 x double> @test7b(<4 x double> %A, <4 x double> %B) {
-  %sub = fsub <4 x double> %A, %B
-  %add = fadd <4 x double> %B, %A
-  %vecinit6 = shufflevector <4 x double> %add, <4 x double> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x double> %vecinit6
-}
-; CHECK-LABEL: test7
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; AVX: vxorpd
-; AVX-NEXT: vaddsubpd
-; AVX-NOT: vxorpd
-; AVX-NOT: vaddsubpd
-; CHECK: ret
-
-
-define <2 x double> @test8b(<2 x double> %A, <2 x double> %B) #0 {
-  %add = fadd <2 x double> %B, %A
-  %sub = fsub <2 x double> %A, %B
-  %vecinit2 = shufflevector <2 x double> %add, <2 x double> %sub, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %vecinit2
-}
-; CHECK-LABEL: test8
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; AVX: vxorpd
-; AVX-NEXT: vaddsubpd
-; CHECK: ret
-
diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index 18bdcb3..0a5b0ca 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll
@@ -1,99 +1,120 @@
 ; These are tests for SSE3 codegen.
 
-; RUN: llc < %s -march=x86-64 -mcpu=nocona -mtriple=i686-apple-darwin9 -O3 \
-; RUN:              | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -march=x86-64 -mcpu=nocona -mtriple=i686-apple-darwin9 -O3 | FileCheck %s --check-prefix=X64
 
 ; Test for v8xi16 lowering where we extract the first element of the vector and
 ; placed it in the second element of the result.
 
 define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind {
+; X64-LABEL: t0:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    movl $1, %eax
+; X64-NEXT:    movd %eax, %xmm0
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; X64-NEXT:    movdqa %xmm0, (%rdi)
+; X64-NEXT:    retq
 entry:
 	%tmp3 = load <8 x i16>* %old
 	%tmp6 = shufflevector <8 x i16> %tmp3,
-                <8 x i16> < i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >,
+                <8 x i16> < i16 1, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >,
                 <8 x i32> < i32 8, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef  >
 	store <8 x i16> %tmp6, <8 x i16>* %dest
 	ret void
-
-; X64-LABEL: t0:
-; X64:	movdqa	(%rsi), %xmm0
-; X64:	pslldq	$2, %xmm0
-; X64:	movdqa	%xmm0, (%rdi)
-; X64:	ret
 }
 
 define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+; X64-LABEL: t1:
+; X64:       ## BB#0:
+; X64-NEXT:    movdqa (%rdi), %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT:    retq
 	%tmp1 = load <8 x i16>* %A
 	%tmp2 = load <8 x i16>* %B
 	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
 	ret <8 x i16> %tmp3
 
-; X64-LABEL: t1:
-; X64: 	movdqa	(%rdi), %xmm0
-; X64: 	pinsrw	$0, (%rsi), %xmm0
-; X64: 	ret
 }
 
 define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t2:
+; X64:       ## BB#0:
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,0,3,4,5,6,7]
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X64-NEXT:    retq
 	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 >
 	ret <8 x i16> %tmp
-; X64-LABEL: t2:
-; X64:	pextrw	$1, %xmm1, %eax
-; X64:	pinsrw	$0, %eax, %xmm0
-; X64:	pinsrw	$3, %eax, %xmm0
-; X64:	ret
 }
 
 define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t3:
+; X64:       ## BB#0:
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; X64-NEXT:    retq
 	%tmp = shufflevector <8 x i16> %A, <8 x i16> %A, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 >
 	ret <8 x i16> %tmp
-; X64-LABEL: t3:
-; X64: 	pextrw	$5, %xmm0, %eax
-; X64: 	pshuflw	$44, %xmm0, %xmm0
-; X64: 	pshufhw	$27, %xmm0, %xmm0
-; X64: 	pinsrw	$3, %eax, %xmm0
-; X64: 	ret
 }
 
 define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t4:
+; X64:       ## BB#0:
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,4,7]
+; X64-NEXT:    retq
 	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 >
 	ret <8 x i16> %tmp
-; X64-LABEL: t4:
-; X64: 	pextrw	$7, [[XMM0:%xmm[0-9]+]], %eax
-; X64: 	pshufhw	$100, [[XMM0]], [[XMM1:%xmm[0-9]+]]
-; X64: 	pinsrw	$1, %eax, [[XMM1]]
-; X64: 	pextrw	$1, [[XMM0]], %eax
-; X64: 	pinsrw	$4, %eax, %xmm{{[0-9]}}
-; X64: 	ret
 }
 
 define <8 x i16> @t5(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t5:
+; X64:       ## BB#0:
+; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT:    movdqa %xmm1, %xmm0
+; X64-NEXT:    retq
 	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 >
 	ret <8 x i16> %tmp
-; X64: 	t5:
-; X64: 		movlhps	%xmm1, %xmm0
-; X64: 		pshufd	$114, %xmm0, %xmm0
-; X64: 		ret
 }
 
 define <8 x i16> @t6(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t6:
+; X64:       ## BB#0:
+; X64-NEXT:    movss %xmm1, %xmm0
+; X64-NEXT:    retq
 	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
 	ret <8 x i16> %tmp
-; X64: 	t6:
-; X64: 		movss	%xmm1, %xmm0
-; X64: 		ret
 }
 
 define <8 x i16> @t7(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t7:
+; X64:       ## BB#0:
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
+; X64-NEXT:    retq
 	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 0, i32 3, i32 2, i32 4, i32 6, i32 4, i32 7 >
 	ret <8 x i16> %tmp
-; X64: 	t7:
-; X64: 		pshuflw	$-80, %xmm0, %xmm0
-; X64: 		pshufhw	$-56, %xmm0, %xmm0
-; X64: 		ret
 }
 
 define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind {
+; X64-LABEL: t8:
+; X64:       ## BB#0:
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = mem[2,1,0,3,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; X64-NEXT:    movdqa %xmm0, (%rdi)
+; X64-NEXT:    retq
 	%tmp = load <2 x i64>* %A
 	%tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16>
 	%tmp0 = extractelement <8 x i16> %tmp.upgrd.1, i32 0
@@ -115,14 +136,15 @@ define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind {
 	%tmp15.upgrd.2 = bitcast <8 x i16> %tmp15 to <2 x i64>
 	store <2 x i64> %tmp15.upgrd.2, <2 x i64>* %res
 	ret void
-; X64: 	t8:
-; X64: 		pshuflw	$-58, (%rsi), %xmm0
-; X64: 		pshufhw	$-58, %xmm0, %xmm0
-; X64: 		movdqa	%xmm0, (%rdi)
-; X64: 		ret
 }
 
 define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
+; X64-LABEL: t9:
+; X64:       ## BB#0:
+; X64-NEXT:    movapd (%rdi), %xmm0
+; X64-NEXT:    movhpd (%rsi), %xmm0
+; X64-NEXT:    movapd %xmm0, (%rdi)
+; X64-NEXT:    retq
 	%tmp = load <4 x float>* %r
 	%tmp.upgrd.3 = bitcast <2 x i32>* %A to double*
 	%tmp.upgrd.4 = load double* %tmp.upgrd.3
@@ -139,11 +161,6 @@ define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
 	%tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3
 	store <4 x float> %tmp13, <4 x float>* %r
 	ret void
-; X64: 	t9:
-; X64: 		movaps	(%rdi), %xmm0
-; X64:	        movhps	(%rsi), %xmm0
-; X64:	        movaps	%xmm0, (%rdi)
-; X64: 		ret
 }
 
 
@@ -154,113 +171,121 @@ define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
 @g1 = external constant <4 x i32>
 @g2 = external constant <4 x i16>
 
-define internal void @t10() nounwind {
-        load <4 x i32>* @g1, align 16
-        bitcast <4 x i32> %1 to <8 x i16>
-        shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef >
-        bitcast <8 x i16> %3 to <2 x i64>
-        extractelement <2 x i64> %4, i32 0
-        bitcast i64 %5 to <4 x i16>
-        store <4 x i16> %6, <4 x i16>* @g2, align 8
-        ret void
-; X64: 	t10:
-; X64: 		pextrw	$4, [[X0:%xmm[0-9]+]], %e{{..}}
-; X64: 		pextrw	$6, [[X0]], %e{{..}}
-; X64: 		movlhps [[X0]], [[X0]]
-; X64: 		pshuflw	$8, [[X0]], [[X0]]
-; X64: 		pinsrw	$2, %e{{..}}, [[X0]]
-; X64: 		pinsrw	$3, %e{{..}}, [[X0]]
+define void @t10() nounwind {
+; X64-LABEL: t10:
+; X64:       ## BB#0:
+; X64-NEXT:    movq _g1@{{.*}}(%rip), %rax
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT:    movq _g2@{{.*}}(%rip), %rax
+; X64-NEXT:    movq %xmm0, (%rax)
+; X64-NEXT:    retq
+  load <4 x i32>* @g1, align 16
+  bitcast <4 x i32> %1 to <8 x i16>
+  shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef >
+  bitcast <8 x i16> %3 to <2 x i64>
+  extractelement <2 x i64> %4, i32 0
+  bitcast i64 %5 to <4 x i16>
+  store <4 x i16> %6, <4 x i16>* @g2, align 8
+  ret void
 }
 
-
 ; Pack various elements via shuffles.
 define <8 x i16> @t11(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+; X64-LABEL: t11:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; X64-NEXT:    retq
 entry:
 	%tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
 	ret <8 x i16> %tmp7
 
-; X64-LABEL: t11:
-; X64:	movd	%xmm1, %eax
-; X64:	movlhps	%xmm0, %xmm0
-; X64:	pshuflw	$1, %xmm0, %xmm0
-; X64:	pinsrw	$1, %eax, %xmm0
-; X64:	ret
 }
 
-
 define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+; X64-LABEL: t12:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7]
+; X64-NEXT:    retq
 entry:
 	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
 	ret <8 x i16> %tmp9
 
-; X64-LABEL: t12:
-; X64: 	pextrw	$3, %xmm1, %eax
-; X64: 	movlhps	%xmm0, %xmm0
-; X64: 	pshufhw	$3, %xmm0, %xmm0
-; X64: 	pinsrw	$5, %eax, %xmm0
-; X64: 	ret
 }
 
-
 define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+; X64-LABEL: t13:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7]
+; X64-NEXT:    retq
 entry:
 	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef >
 	ret <8 x i16> %tmp9
-; X64-LABEL: t13:
-; X64: 	punpcklqdq	%xmm0, %xmm1
-; X64: 	pextrw	$3, %xmm1, %eax
-; X64: 	pshufhw	$12, %xmm1, %xmm0
-; X64: 	pinsrw	$4, %eax, %xmm0
-; X64: 	ret
 }
 
-
 define <8 x i16> @t14(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+; X64-LABEL: t14:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; X64-NEXT:    retq
 entry:
 	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef , i32 undef >
 	ret <8 x i16> %tmp9
-; X64-LABEL: t14:
-; X64: 	punpcklqdq	%xmm0, %xmm1
-; X64: 	pshufhw	$8, %xmm1, %xmm0
-; X64: 	ret
 }
 
-
 ; FIXME: t15 is worse off from disabling of scheduler 2-address hack.
 define <8 x i16> @t15(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+; X64-LABEL: t15:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; X64-NEXT:    retq
 entry:
-        %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
-        ret <8 x i16> %tmp8
-; X64: 	t15:
-; X64: 		pextrw	$7, %xmm0, %eax
-; X64: 		punpcklqdq	%xmm1, %xmm0
-; X64: 		pshuflw	$-128, %xmm0, %xmm0
-; X64: 		pinsrw	$2, %eax, %xmm0
-; X64: 		ret
+  %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
+  ret <8 x i16> %tmp8
 }
 
-
 ; Test yonah where we convert a shuffle to pextrw and pinrsw
 define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone {
+; X64-LABEL: t16:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0]
+; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT:    pxor %xmm2, %xmm2
+; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT:    packuswb %xmm0, %xmm0
+; X64-NEXT:    retq
 entry:
-        %tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0,  i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
-        %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0,  <16 x i32> < i32 0, i32 1, i32 2, i32 17,  i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
-        ret <16 x i8> %tmp9
-; X64: 	t16:
-; X64: 		pextrw	$8, %xmm0, %eax
-; X64: 		pslldq	$2, %xmm0
-; X64: 		pextrw	$1, %xmm0, %ecx
-; X64: 		movzbl	%cl, %ecx
-; X64: 		orl	%eax, %ecx
-; X64: 		pinsrw	$1, %ecx, %xmm0
-; X64: 		ret
+  %tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0,  i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
+  %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0,  <16 x i32> < i32 0, i32 1, i32 2, i32 17,  i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
+  ret <16 x i8> %tmp9
 }
 
 ; rdar://8520311
 define <4 x i32> @t17() nounwind {
-entry:
 ; X64-LABEL: t17:
-; X64:          movddup (%rax), %xmm0
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    movddup (%rax), %xmm0
+; X64-NEXT:    andpd {{.*}}(%rip), %xmm0
+; X64-NEXT:    retq
+entry:
   %tmp1 = load <4 x float>* undef, align 16
   %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   %tmp3 = load <4 x float>* undef, align 16
diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll
deleted file mode 100644
index 3a48121..0000000
--- a/test/CodeGen/X86/sse41-blend.ll
+++ /dev/null
@@ -1,140 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
-
-;CHECK-LABEL: vsel_float:
-;CHECK: blendps
-;CHECK: ret
-define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x float> %v1, <4 x float> %v2
-  ret <4 x float> %vsel
-}
-
-
-;CHECK-LABEL: vsel_4xi8:
-;CHECK: blendps
-;CHECK: ret
-define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
-  ret <4 x i8> %vsel
-}
-
-;CHECK-LABEL: vsel_4xi16:
-;CHECK: blendps
-;CHECK: ret
-define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
-  ret <4 x i16> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i32:
-;CHECK: blendps
-;CHECK: ret
-define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> %v1, <4 x i32> %v2
-  ret <4 x i32> %vsel
-}
-
-
-;CHECK-LABEL: vsel_double:
-;CHECK: movsd
-;CHECK: ret
-define <4 x double> @vsel_double(<4 x double> %v1, <4 x double> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> %v1, <4 x double> %v2
-  ret <4 x double> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i64:
-;CHECK: movsd
-;CHECK: ret
-define <4 x i64> @vsel_i64(<4 x i64> %v1, <4 x i64> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> %v1, <4 x i64> %v2
-  ret <4 x i64> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i8:
-;CHECK: pblendvb
-;CHECK: ret
-define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
-  %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2
-  ret <16 x i8> %vsel
-}
-
-;; TEST blend + compares
-; CHECK: A
-define <2 x double> @A(<2 x double> %x, <2 x double> %y) {
-  ; CHECK: cmplepd
-  ; CHECK: blendvpd
-  %max_is_x = fcmp oge <2 x double> %x, %y
-  %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y
-  ret <2 x double> %max
-}
-
-; CHECK: B
-define <2 x double> @B(<2 x double> %x, <2 x double> %y) {
-  ; CHECK: cmpnlepd
-  ; CHECK: blendvpd
-  %min_is_x = fcmp ult <2 x double> %x, %y
-  %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
-  ret <2 x double> %min
-}
-
-; CHECK: float_crash
-define void @float_crash() nounwind {
-entry:
-  %merge205vector_func.i = select <4 x i1> undef, <4 x double> undef, <4 x double> undef
-  %extract214vector_func.i = extractelement <4 x double> %merge205vector_func.i, i32 0
-  store double %extract214vector_func.i, double addrspace(1)* undef, align 8
-  ret void
-}
-
-; If we can figure out a blend has a constant mask, we should emit the
-; blend instruction with an immediate mask
-define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) {
-; In this case, we emit a simple movss
-; CHECK-LABEL: constant_blendvpd
-; CHECK: movsd
-; CHECK: ret
-  %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %xy, <2 x double> %ab
-  ret <2 x double> %1
-}
-
-define <4 x float> @constant_blendvps(<4 x float> %xyzw, <4 x float> %abcd) {
-; CHECK-LABEL: constant_blendvps
-; CHECK-NOT: mov
-; CHECK: blendps $7
-; CHECK: ret
-  %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %xyzw, <4 x float> %abcd
-  ret <4 x float> %1
-}
-
-define <16 x i8> @constant_pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd) {
-; CHECK-LABEL: constant_pblendvb:
-; CHECK: movaps
-; CHECK: pblendvb
-; CHECK: ret
-  %1 = select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x i8> %xyzw, <16 x i8> %abcd
-  ret <16 x i8> %1
-}
-
-declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
-declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)
-
-;; 2 tests for shufflevectors that optimize to blend + immediate
-; CHECK-LABEL: @blend_shufflevector_4xfloat
-; CHECK: blendps $6, %xmm1, %xmm0
-; CHECK: ret
-define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) {
-  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-  ret <4 x float> %1
-}
-
-; CHECK-LABEL: @blend_shufflevector_8xi16
-; CHECK: pblendw $134, %xmm1, %xmm0
-; CHECK: ret
-define <8 x i16> @blend_shufflevector_8xi16(<8 x i16> %a, <8 x i16> %b) {
-  %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 15>
-  ret <8 x i16> %1
-}
diff --git a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
new file mode 100644
index 0000000..6fab98e
--- /dev/null
+++ b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.1 | FileCheck %s
+; This test works just like the non-upgrade one except that it only checks
+; forms which require auto-upgrading.
+
+define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: blendpd
+  %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: blendps
+  %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+
+define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: dppd
+  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: dpps
+  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: insertps
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+
+
+define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: mpsadbw
+  %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: pblendw
+  %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone
+
+
diff --git a/test/CodeGen/X86/sse41-intrinsics-x86.ll b/test/CodeGen/X86/sse41-intrinsics-x86.ll
index 37eff43..5f25a16 100644
--- a/test/CodeGen/X86/sse41-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse41-intrinsics-x86.ll
@@ -2,18 +2,18 @@
 
 define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
   ; CHECK: blendpd
-  %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+  %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }
-declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
 
 
 define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
   ; CHECK: blendps
-  %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 
 define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
@@ -34,35 +34,35 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa
 
 define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
   ; CHECK: dppd
-  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }
-declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
 
 
 define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
   ; CHECK: dpps
-  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 
 define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
   ; CHECK: insertps
-  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 
 
 define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
   ; CHECK: mpsadbw
-  %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+  %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1]
   ret <8 x i16> %res
 }
-declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone
+declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
 
 
 define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
@@ -83,10 +83,10 @@ declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) noun
 
 define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
   ; CHECK: pblendw
-  %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+  %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1]
   ret <8 x i16> %res
 }
-declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
 
 
 define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) {
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 6726a3e..d5c6f74 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -1,30 +1,47 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64
 
 @g16 = external global i16
 
 define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
-        %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
-        ret <4 x i32> %tmp1
 ; X32-LABEL: pinsrd_1:
-; X32:    pinsrd $1, 4(%esp), %xmm0
-
+; X32:       ## BB#0:
+; X32-NEXT:    pinsrd $1, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT:    retl
+;
 ; X64-LABEL: pinsrd_1:
-; X64:    pinsrd $1, %edi, %xmm0
+; X64:       ## BB#0:
+; X64-NEXT:    pinsrd $1, %edi, %xmm0
+; X64-NEXT:    retq
+  %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
+  ret <4 x i32> %tmp1
 }
 
 define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
-        %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
-        ret <16 x i8> %tmp1
 ; X32-LABEL: pinsrb_1:
-; X32:    pinsrb $1, 4(%esp), %xmm0
-
+; X32:       ## BB#0:
+; X32-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT:    retl
+;
 ; X64-LABEL: pinsrb_1:
-; X64:    pinsrb $1, %edi, %xmm0
+; X64:       ## BB#0:
+; X64-NEXT:    pinsrb $1, %edi, %xmm0
+; X64-NEXT:    retq
+  %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
+  ret <16 x i8> %tmp1
 }
 
-
 define <2 x i64> @pmovsxbd_1(i32* %p) nounwind {
+; X32-LABEL: pmovsxbd_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pmovsxbd (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: pmovsxbd_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    pmovsxbd (%rdi), %xmm0
+; X64-NEXT:    retq
 entry:
 	%0 = load i32* %p, align 4
 	%1 = insertelement <4 x i32> undef, i32 %0, i32 0
@@ -35,16 +52,19 @@ entry:
 	%6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone
 	%7 = bitcast <4 x i32> %6 to <2 x i64>
 	ret <2 x i64> %7
-        
-; X32: _pmovsxbd_1:
-; X32:   movl      4(%esp), %eax
-; X32:   pmovsxbd   (%eax), %xmm0
-
-; X64: _pmovsxbd_1:
-; X64:   pmovsxbd   (%rdi), %xmm0
 }
 
 define <2 x i64> @pmovsxwd_1(i64* %p) nounwind readonly {
+; X32-LABEL: pmovsxwd_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pmovsxwd (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: pmovsxwd_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    pmovsxwd (%rdi), %xmm0
+; X64-NEXT:    retq
 entry:
 	%0 = load i64* %p		; <i64> [#uses=1]
 	%tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0		; <<2 x i64>> [#uses=1]
@@ -52,63 +72,59 @@ entry:
 	%2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone		; <<4 x i32>> [#uses=1]
 	%3 = bitcast <4 x i32> %2 to <2 x i64>		; <<2 x i64>> [#uses=1]
 	ret <2 x i64> %3
-        
-; X32: _pmovsxwd_1:
-; X32:   movl 4(%esp), %eax
-; X32:   pmovsxwd (%eax), %xmm0
-
-; X64: _pmovsxwd_1:
-; X64:   pmovsxwd (%rdi), %xmm0
 }
 
-
-
-
 define <2 x i64> @pmovzxbq_1() nounwind {
+; X32-LABEL: pmovzxbq_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl L_g16$non_lazy_ptr, %eax
+; X32-NEXT:    pmovzxbq (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: pmovzxbq_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    movq _g16@{{.*}}(%rip), %rax
+; X64-NEXT:    pmovzxbq (%rax), %xmm0
+; X64-NEXT:    retq
 entry:
 	%0 = load i16* @g16, align 2		; <i16> [#uses=1]
 	%1 = insertelement <8 x i16> undef, i16 %0, i32 0		; <<8 x i16>> [#uses=1]
 	%2 = bitcast <8 x i16> %1 to <16 x i8>		; <<16 x i8>> [#uses=1]
 	%3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone		; <<2 x i64>> [#uses=1]
 	ret <2 x i64> %3
-
-; X32: _pmovzxbq_1:
-; X32:   movl	L_g16$non_lazy_ptr, %eax
-; X32:   pmovzxbq	(%eax), %xmm0
-
-; X64: _pmovzxbq_1:
-; X64:   movq	_g16@GOTPCREL(%rip), %rax
-; X64:   pmovzxbq	(%rax), %xmm0
 }
 
 declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
 declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
 
-
-
-
 define i32 @extractps_1(<4 x float> %v) nounwind {
+; X32-LABEL: extractps_1:
+; X32:       ## BB#0:
+; X32-NEXT:    extractps $3, %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: extractps_1:
+; X64:       ## BB#0:
+; X64-NEXT:    extractps $3, %xmm0, %eax
+; X64-NEXT:    retq
   %s = extractelement <4 x float> %v, i32 3
   %i = bitcast float %s to i32
   ret i32 %i
-
-; X32: _extractps_1:  
-; X32:	  extractps	$3, %xmm0, %eax
-
-; X64: _extractps_1:  
-; X64:	  extractps	$3, %xmm0, %eax
 }
 define i32 @extractps_2(<4 x float> %v) nounwind {
+; X32-LABEL: extractps_2:
+; X32:       ## BB#0:
+; X32-NEXT:    extractps $3, %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: extractps_2:
+; X64:       ## BB#0:
+; X64-NEXT:    extractps $3, %xmm0, %eax
+; X64-NEXT:    retq
   %t = bitcast <4 x float> %v to <4 x i32>
   %s = extractelement <4 x i32> %t, i32 3
   ret i32 %s
-
-; X32: _extractps_2:
-; X32:	  extractps	$3, %xmm0, %eax
-
-; X64: _extractps_2:
-; X64:	  extractps	$3, %xmm0, %eax
 }
 
 
@@ -117,106 +133,152 @@ define i32 @extractps_2(<4 x float> %v) nounwind {
 ; is bitcasted to i32, but unsuitable for much of anything else.
 
 define float @ext_1(<4 x float> %v) nounwind {
+; X32-LABEL: ext_1:
+; X32:       ## BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X32-NEXT:    addss LCPI7_0, %xmm0
+; X32-NEXT:    movss %xmm0, (%esp)
+; X32-NEXT:    flds (%esp)
+; X32-NEXT:    popl %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: ext_1:
+; X64:       ## BB#0:
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X64-NEXT:    addss {{.*}}(%rip), %xmm0
+; X64-NEXT:    retq
   %s = extractelement <4 x float> %v, i32 3
   %t = fadd float %s, 1.0
   ret float %t
-
-; X32: _ext_1:
-; X32:	  pshufd	$3, %xmm0, %xmm0
-; X32:	  addss	LCPI7_0, %xmm0
-
-; X64: _ext_1:
-; X64:	  pshufd	$3, %xmm0, %xmm0
-; X64:	  addss	LCPI7_0(%rip), %xmm0
 }
 define float @ext_2(<4 x float> %v) nounwind {
+; X32-LABEL: ext_2:
+; X32:       ## BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X32-NEXT:    movss %xmm0, (%esp)
+; X32-NEXT:    flds (%esp)
+; X32-NEXT:    popl %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: ext_2:
+; X64:       ## BB#0:
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X64-NEXT:    retq
   %s = extractelement <4 x float> %v, i32 3
   ret float %s
-
-; X32: _ext_2:
-; X32:	  pshufd	$3, %xmm0, %xmm0
-
-; X64: _ext_2:
-; X64:	  pshufd	$3, %xmm0, %xmm0
 }
 define i32 @ext_3(<4 x i32> %v) nounwind {
+; X32-LABEL: ext_3:
+; X32:       ## BB#0:
+; X32-NEXT:    pextrd $3, %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: ext_3:
+; X64:       ## BB#0:
+; X64-NEXT:    pextrd $3, %xmm0, %eax
+; X64-NEXT:    retq
   %i = extractelement <4 x i32> %v, i32 3
   ret i32 %i
-
-; X32: _ext_3:
-; X32:	  pextrd	$3, %xmm0, %eax
-
-; X64: _ext_3:
-; X64:	  pextrd	$3, %xmm0, %eax
 }
 
 define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
-        %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
-        ret <4 x float> %tmp1
-; X32: _insertps_1:
-; X32:    insertps  $1, %xmm1, %xmm0
-
-; X64: _insertps_1:
-; X64:    insertps  $1, %xmm1, %xmm0
+; X32-LABEL: insertps_1:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_1:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
+; X64-NEXT:    retq
+  %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
+  ret <4 x float> %tmp1
 }
 
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
 
 define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
-        %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
-        ret <4 x float> %tmp1
-; X32: _insertps_2:
-; X32:    insertps  $0, 4(%esp), %xmm0
-
-; X64: _insertps_2:
-; X64:    insertps  $0, %xmm1, %xmm0        
+; X32-LABEL: insertps_2:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps $0, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_2:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    retq
+  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
+  ret <4 x float> %tmp1
 }
-
 define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind {
-        %tmp2 = extractelement <4 x float> %t2, i32 0
-        %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
-        ret <4 x float> %tmp1
-; X32: _insertps_3:
-; X32:    insertps  $0, %xmm1, %xmm0        
-
-; X64: _insertps_3:
-; X64:    insertps  $0, %xmm1, %xmm0        
+; X32-LABEL: insertps_3:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_3:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    retq
+  %tmp2 = extractelement <4 x float> %t2, i32 0
+  %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
+  ret <4 x float> %tmp1
 }
 
 define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
-        %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
-        ret i32 %tmp1
-; X32: _ptestz_1:
-; X32:    ptest 	%xmm1, %xmm0
-; X32:    sete	%al
-
-; X64: _ptestz_1:
-; X64:    ptest 	%xmm1, %xmm0
-; X64:    sete	%al
+; X32-LABEL: ptestz_1:
+; X32:       ## BB#0:
+; X32-NEXT:    ptest %xmm1, %xmm0
+; X32-NEXT:    sete %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: ptestz_1:
+; X64:       ## BB#0:
+; X64-NEXT:    ptest %xmm1, %xmm0
+; X64-NEXT:    sete %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
+  ret i32 %tmp1
 }
 
 define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
-        %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
-        ret i32 %tmp1
-; X32: _ptestz_2:
-; X32:    ptest 	%xmm1, %xmm0
-; X32:    sbbl	%eax
-
-; X64: _ptestz_2:
-; X64:    ptest 	%xmm1, %xmm0
-; X64:    sbbl	%eax
+; X32-LABEL: ptestz_2:
+; X32:       ## BB#0:
+; X32-NEXT:    ptest %xmm1, %xmm0
+; X32-NEXT:    sbbl %eax, %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: ptestz_2:
+; X64:       ## BB#0:
+; X64-NEXT:    ptest %xmm1, %xmm0
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    retq
+  %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
+  ret i32 %tmp1
 }
 
 define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
-        %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
-        ret i32 %tmp1
-; X32: _ptestz_3:
-; X32:    ptest 	%xmm1, %xmm0
-; X32:    seta	%al
-
-; X64: _ptestz_3:
-; X64:    ptest 	%xmm1, %xmm0
-; X64:    seta	%al
+; X32-LABEL: ptestz_3:
+; X32:       ## BB#0:
+; X32-NEXT:    ptest %xmm1, %xmm0
+; X32-NEXT:    seta %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: ptestz_3:
+; X64:       ## BB#0:
+; X64-NEXT:    ptest %xmm1, %xmm0
+; X64-NEXT:    seta %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
+  ret i32 %tmp1
 }
 
 
@@ -227,6 +289,25 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
 ; This used to compile to insertps $0  + insertps $16.  insertps $0 is always
 ; pointless.
 define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind  {
+; X32-LABEL: buildvector:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movaps %xmm0, %xmm2
+; X32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; X32-NEXT:    addss %xmm1, %xmm0
+; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X32-NEXT:    addss %xmm2, %xmm1
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: buildvector:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    movaps %xmm0, %xmm2
+; X64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; X64-NEXT:    addss %xmm1, %xmm0
+; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X64-NEXT:    addss %xmm2, %xmm1
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X64-NEXT:    retq
 entry:
   %tmp7 = extractelement <2 x float> %A, i32 0
   %tmp5 = extractelement <2 x float> %A, i32 1
@@ -237,97 +318,124 @@ entry:
   %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
   %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
   ret <2 x float> %tmp9
-; X32-LABEL: buildvector:
-; X32-NOT: insertps $0
-; X32: insertps $16
-; X32-NOT: insertps $0
-; X32: ret
-; X64-LABEL: buildvector:
-; X64-NOT: insertps $0
-; X64: insertps $16
-; X64-NOT: insertps $0
-; X64: ret
 }
 
 define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; X32-LABEL: insertps_from_shufflevector_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    insertps $48, (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_shufflevector_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    insertps $48, (%rdi), %xmm0
+; X64-NEXT:    retq
 entry:
   %0 = load <4 x float>* %pb, align 16
   %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
   ret <4 x float> %vecinit6
-; CHECK-LABEL: insertps_from_shufflevector_1:
-; CHECK-NOT: movss
-; CHECK-NOT: shufps
-; CHECK: insertps    $48,
-; CHECK: ret
 }
 
 define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
+; X32-LABEL: insertps_from_shufflevector_2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_shufflevector_2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X64-NEXT:    retq
 entry:
   %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
   ret <4 x float> %vecinit6
-; CHECK-LABEL: insertps_from_shufflevector_2:
-; CHECK-NOT: shufps
-; CHECK: insertps    $96,
-; CHECK: ret
 }
 
 ; For loading an i32 from memory into an xmm register we use pinsrd
 ; instead of insertps
 define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
+; X32-LABEL: pinsrd_from_shufflevector_i32:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    insertps $48, (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: pinsrd_from_shufflevector_i32:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    insertps $48, (%rdi), %xmm0
+; X64-NEXT:    retq
 entry:
   %0 = load <4 x i32>* %pb, align 16
   %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
   ret <4 x i32> %vecinit6
-; CHECK-LABEL: pinsrd_from_shufflevector_i32:
-; CHECK-NOT: movss
-; CHECK-NOT: shufps
-; CHECK: pinsrd  $3,
-; CHECK: ret
 }
 
 define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
+; X32-LABEL: insertps_from_shufflevector_i32_2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[3],xmm0[2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_shufflevector_i32_2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[3],xmm0[2,3]
+; X64-NEXT:    retq
 entry:
   %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
   ret <4 x i32> %vecinit6
-; CHECK-LABEL: insertps_from_shufflevector_i32_2:
-; CHECK-NOT: shufps
-; CHECK-NOT: movaps
-; CHECK: insertps    $208,
-; CHECK: ret
 }
 
 define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
-; CHECK-LABEL: insertps_from_load_ins_elt_undef:
-; CHECK-NOT: movss
-; CHECK-NOT: shufps
-; CHECK: insertps    $16,
-; CHECK: ret
+; X32-LABEL: insertps_from_load_ins_elt_undef:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    insertps $16, (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_load_ins_elt_undef:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps $16, (%rdi), %xmm0
+; X64-NEXT:    retq
   %1 = load float* %b, align 4
   %2 = insertelement <4 x float> undef, float %1, i32 0
   %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
   ret <4 x float> %result
 }
 
-define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
-; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32:
 ; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
-;; aCHECK-NOT: movd
-; CHECK-NOT: shufps
-; CHECK: insertps    $32,
-; CHECK: ret
+define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
+; X32-LABEL: insertps_from_load_ins_elt_undef_i32:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movd (%eax), %xmm1
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_load_ins_elt_undef_i32:
+; X64:       ## BB#0:
+; X64-NEXT:    movd (%rdi), %xmm1
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; X64-NEXT:    retq
   %1 = load i32* %b, align 4
   %2 = insertelement <4 x i32> undef, i32 %1, i32 0
   %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
   ret <4 x i32> %result
 }
 
-;;;;;; Shuffles optimizable with a single insertps instruction
+;;;;;; Shuffles optimizable with a single insertps or blend instruction
 define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_XYZ0:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $8
-; CHECK: ret
+; X32-LABEL: shuf_XYZ0:
+; X32:       ## BB#0:
+; X32-NEXT:    xorps %xmm1, %xmm1
+; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: shuf_XYZ0:
+; X64:       ## BB#0:
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecext1 = extractelement <4 x float> %x, i32 1
@@ -339,11 +447,15 @@ define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
 }
 
 define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_XY00:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $12
-; CHECK: ret
+; X32-LABEL: shuf_XY00:
+; X32:       ## BB#0:
+; X32-NEXT:    movq %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: shuf_XY00:
+; X64:       ## BB#0:
+; X64-NEXT:    movq %xmm0, %xmm0
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecext1 = extractelement <4 x float> %x, i32 1
@@ -354,11 +466,15 @@ define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
 }
 
 define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_XYY0:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $104
-; CHECK: ret
+; X32-LABEL: shuf_XYY0:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: shuf_XYY0:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecext1 = extractelement <4 x float> %x, i32 1
@@ -369,9 +485,15 @@ define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
 }
 
 define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_XYW0:
-; CHECK: insertps    $232
-; CHECK: ret
+; X32-LABEL: shuf_XYW0:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: shuf_XYW0:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecext1 = extractelement <4 x float> %x, i32 1
@@ -383,11 +505,15 @@ define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
 }
 
 define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_W00W:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $198
-; CHECK: ret
+; X32-LABEL: shuf_W00W:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: shuf_W00W:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 3
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
@@ -397,11 +523,19 @@ define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
 }
 
 define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_X00A:
-; CHECK-NOT: movaps
-; CHECK-NOT: shufps
-; CHECK: insertps    $48
-; CHECK: ret
+; X32-LABEL: shuf_X00A:
+; X32:       ## BB#0:
+; X32-NEXT:    xorps %xmm2, %xmm2
+; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: shuf_X00A:
+; X64:       ## BB#0:
+; X64-NEXT:    xorps %xmm2, %xmm2
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
@@ -411,11 +545,21 @@ define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
 }
 
 define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_X00X:
-; CHECK-NOT: movaps
-; CHECK-NOT: shufps
-; CHECK: insertps    $48
-; CHECK: ret
+; X32-LABEL: shuf_X00X:
+; X32:       ## BB#0:
+; X32-NEXT:    xorps %xmm1, %xmm1
+; X32-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],zero,zero,xmm0[0]
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: shuf_X00X:
+; X64:       ## BB#0:
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],zero,zero,xmm0[0]
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
@@ -425,12 +569,23 @@ define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
 }
 
 define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_X0YC:
-; CHECK: shufps
-; CHECK-NOT: movhlps
-; CHECK-NOT: shufps
-; CHECK: insertps    $176
-; CHECK: ret
+; X32-LABEL: shuf_X0YC:
+; X32:       ## BB#0:
+; X32-NEXT:    xorps %xmm2, %xmm2
+; X32-NEXT:    blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0],zero,xmm0[1],zero
+; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
+; X32-NEXT:    movaps %xmm2, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: shuf_X0YC:
+; X64:       ## BB#0:
+; X64-NEXT:    xorps %xmm2, %xmm2
+; X64-NEXT:    blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0],zero,xmm0[1],zero
+; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
+; X64-NEXT:    movaps %xmm2, %xmm0
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
@@ -440,11 +595,17 @@ define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
 }
 
 define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_XYZ0:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $8
-; CHECK: ret
+; X32-LABEL: i32_shuf_XYZ0:
+; X32:       ## BB#0:
+; X32-NEXT:    pxor %xmm1, %xmm1
+; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; X32-NEXT:    retl
+;
+; X64-LABEL: i32_shuf_XYZ0:
+; X64:       ## BB#0:
+; X64-NEXT:    pxor %xmm1, %xmm1
+; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecext1 = extractelement <4 x i32> %x, i32 1
@@ -456,11 +617,15 @@ define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
 }
 
 define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_XY00:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $12
-; CHECK: ret
+; X32-LABEL: i32_shuf_XY00:
+; X32:       ## BB#0:
+; X32-NEXT:    movq %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: i32_shuf_XY00:
+; X64:       ## BB#0:
+; X64-NEXT:    movq %xmm0, %xmm0
+; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecext1 = extractelement <4 x i32> %x, i32 1
@@ -471,11 +636,15 @@ define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
 }
 
 define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_XYY0:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $104
-; CHECK: ret
+; X32-LABEL: i32_shuf_XYY0:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: i32_shuf_XYY0:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
+; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecext1 = extractelement <4 x i32> %x, i32 1
@@ -486,11 +655,15 @@ define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
 }
 
 define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_XYW0:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $232
-; CHECK: ret
+; X32-LABEL: i32_shuf_XYW0:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: i32_shuf_XYW0:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
+; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecext1 = extractelement <4 x i32> %x, i32 1
@@ -502,11 +675,15 @@ define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
 }
 
 define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_W00W:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $198
-; CHECK: ret
+; X32-LABEL: i32_shuf_W00W:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: i32_shuf_W00W:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
+; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 3
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
@@ -516,11 +693,19 @@ define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
 }
 
 define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_X00A:
-; CHECK-NOT: movaps
-; CHECK-NOT: shufps
-; CHECK: insertps    $48
-; CHECK: ret
+; X32-LABEL: i32_shuf_X00A:
+; X32:       ## BB#0:
+; X32-NEXT:    xorps %xmm2, %xmm2
+; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: i32_shuf_X00A:
+; X64:       ## BB#0:
+; X64-NEXT:    xorps %xmm2, %xmm2
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
@@ -530,11 +715,21 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
 }
 
 define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_X00X:
-; CHECK-NOT: movaps
-; CHECK-NOT: shufps
-; CHECK: insertps    $48
-; CHECK: ret
+; X32-LABEL: i32_shuf_X00X:
+; X32:       ## BB#0:
+; X32-NEXT:    xorps %xmm1, %xmm1
+; X32-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: i32_shuf_X00X:
+; X64:       ## BB#0:
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
@@ -544,12 +739,23 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
 }
 
 define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_X0YC:
-; CHECK: shufps
-; CHECK-NOT: movhlps
-; CHECK-NOT: shufps
-; CHECK: insertps    $176
-; CHECK: ret
+; X32-LABEL: i32_shuf_X0YC:
+; X32:       ## BB#0:
+; X32-NEXT:    xorps %xmm2, %xmm2
+; X32-NEXT:    blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1],zero
+; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
+; X32-NEXT:    movaps %xmm2, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: i32_shuf_X0YC:
+; X64:       ## BB#0:
+; X64-NEXT:    xorps %xmm2, %xmm2
+; X64-NEXT:    blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1],zero
+; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
+; X64-NEXT:    movaps %xmm2, %xmm0
+; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
@@ -560,11 +766,19 @@ define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
 
 ;; Test for a bug in the first implementation of LowerBuildVectorv4x32
 define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
-; CHECK-LABEL: test_insertps_no_undef:
-; CHECK: movaps  %xmm0, %xmm1
-; CHECK-NEXT: insertps        $8, %xmm1, %xmm1
-; CHECK-NEXT: maxps   %xmm1, %xmm0
-; CHECK-NEXT: ret
+; X32-LABEL: test_insertps_no_undef:
+; X32:       ## BB#0:
+; X32-NEXT:    xorps %xmm1, %xmm1
+; X32-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
+; X32-NEXT:    maxps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_insertps_no_undef:
+; X64:       ## BB#0:
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
+; X64-NEXT:    maxps %xmm1, %xmm0
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecext1 = extractelement <4 x float> %x, i32 1
@@ -578,48 +792,75 @@ define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
 }
 
 define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
-; CHECK-LABEL: blendvb_fallback
-; CHECK: blendvb
-; CHECK: ret
+; X32-LABEL: blendvb_fallback:
+; X32:       ## BB#0:
+; X32-NEXT:    psllw $15, %xmm0
+; X32-NEXT:    psraw $15, %xmm0
+; X32-NEXT:    pblendvb %xmm1, %xmm2
+; X32-NEXT:    movdqa %xmm2, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: blendvb_fallback:
+; X64:       ## BB#0:
+; X64-NEXT:    psllw $15, %xmm0
+; X64-NEXT:    psraw $15, %xmm0
+; X64-NEXT:    pblendvb %xmm1, %xmm2
+; X64-NEXT:    movdqa %xmm2, %xmm0
+; X64-NEXT:    retq
   %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
   ret <8 x i16> %ret
 }
 
-define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
-; CHECK-LABEL: insertps_from_vector_load:
 ; On X32, account for the argument's move to registers
-; X32: movl    4(%esp), %eax
-; CHECK-NOT: mov
-; CHECK: insertps    $48
-; CHECK-NEXT: ret
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; X32-LABEL: insertps_from_vector_load:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    insertps $48, (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_vector_load:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps $48, (%rdi), %xmm0
+; X64-NEXT:    retq
   %1 = load <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
   ret <4 x float> %2
 }
 
 ;; Use a non-zero CountS for insertps
-define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
-; CHECK-LABEL: insertps_from_vector_load_offset:
-; On X32, account for the argument's move to registers
-; X32: movl    4(%esp), %eax
-; CHECK-NOT: mov
 ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: insertps    $96, 4(%{{...}}), %
-; CHECK-NEXT: ret
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; X32-LABEL: insertps_from_vector_load_offset:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    insertps $96, 4(%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_vector_load_offset:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps $96, 4(%rdi), %xmm0
+; X64-NEXT:    retq
   %1 = load <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
   ret <4 x float> %2
 }
 
-define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
-; CHECK-LABEL: insertps_from_vector_load_offset_2:
-; On X32, account for the argument's move to registers
-; X32: movl    4(%esp), %eax
-; X32: movl    8(%esp), %ecx
-; CHECK-NOT: mov
 ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: insertps    $192, 12(%{{...}},%{{...}}), %
-; CHECK-NEXT: ret
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; X32-LABEL: insertps_from_vector_load_offset_2:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    shll $4, %ecx
+; X32-NEXT:    insertps $-64, 12(%eax,%ecx), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_vector_load_offset_2:
+; X64:       ## BB#0:
+; X64-NEXT:    shlq $4, %rsi
+; X64-NEXT:    insertps $-64, 12(%rdi,%rsi), %xmm0
+; X64-NEXT:    retq
   %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
   %2 = load <4 x float>* %1, align 16
   %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
@@ -627,13 +868,21 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
 }
 
 define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
-; CHECK-LABEL: insertps_from_broadcast_loadf32:
-; On X32, account for the arguments' move to registers
-; X32: movl    8(%esp), %eax
-; X32: movl    4(%esp), %ecx
-; CHECK-NOT: mov
-; CHECK: insertps    $48
-; CHECK-NEXT: ret
+; X32-LABEL: insertps_from_broadcast_loadf32:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movss (%ecx,%eax,4), %xmm1
+; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_broadcast_loadf32:
+; X64:       ## BB#0:
+; X64-NEXT:    movss (%rdi,%rsi,4), %xmm1
+; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT:    retq
   %1 = getelementptr inbounds float* %fb, i64 %index
   %2 = load float* %1, align 4
   %3 = insertelement <4 x float> undef, float %2, i32 0
@@ -645,12 +894,20 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap
 }
 
 define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
-; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
-; On X32, account for the arguments' move to registers
-; X32: movl    4(%esp), %{{...}}
-; CHECK-NOT: mov
-; CHECK: insertps    $48
-; CHECK-NEXT: ret
+; X32-LABEL: insertps_from_broadcast_loadv4f32:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movups (%eax), %xmm1
+; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_broadcast_loadv4f32:
+; X64:       ## BB#0:
+; X64-NEXT:    movups (%rdi), %xmm1
+; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT:    retq
   %1 = load <4 x float>* %b, align 4
   %2 = extractelement <4 x float> %1, i32 0
   %3 = insertelement <4 x float> undef, float %2, i32 0
@@ -663,20 +920,33 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float
 
 ;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
 define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
-; CHECK-LABEL: insertps_from_broadcast_multiple_use:
-; On X32, account for the arguments' move to registers
-; X32: movl    8(%esp), %eax
-; X32: movl    4(%esp), %ecx
-; CHECK: movss
-; CHECK-NOT: mov
-; CHECK: insertps    $48
-; CHECK: insertps    $48
-; CHECK: insertps    $48
-; CHECK: insertps    $48
-; CHECK: addps
-; CHECK: addps
-; CHECK: addps
-; CHECK-NEXT: ret
+; X32-LABEL: insertps_from_broadcast_multiple_use:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movss (%ecx,%eax,4), %xmm4
+; X32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
+; X32-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; X32-NEXT:    addps %xmm1, %xmm0
+; X32-NEXT:    addps %xmm2, %xmm3
+; X32-NEXT:    addps %xmm3, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_broadcast_multiple_use:
+; X64:       ## BB#0:
+; X64-NEXT:    movss (%rdi,%rsi,4), %xmm4
+; X64-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
+; X64-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; X64-NEXT:    addps %xmm1, %xmm0
+; X64-NEXT:    addps %xmm2, %xmm3
+; X64-NEXT:    addps %xmm3, %xmm0
+; X64-NEXT:    retq
   %1 = getelementptr inbounds float* %fb, i64 %index
   %2 = load float* %1, align 4
   %3 = insertelement <4 x float> undef, float %2, i32 0
@@ -694,10 +964,20 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 }
 
 define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
-; CHECK-LABEL: insertps_with_undefs:
-; CHECK-NOT: shufps
-; CHECK: insertps    $32, %xmm0
-; CHECK: ret
+; X32-LABEL: insertps_with_undefs:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movss (%eax), %xmm1
+; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm0[0],xmm1[3]
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_with_undefs:
+; X64:       ## BB#0:
+; X64-NEXT:    movss (%rdi), %xmm1
+; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm0[0],xmm1[3]
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
   %1 = load float* %b, align 4
   %2 = insertelement <4 x float> undef, float %1, i32 0
   %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
@@ -707,10 +987,162 @@ define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
 ; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
 ; the destination index to change the load, instead of the source index.
 define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
-; CHECK-LABEL: pr20087:
-; CHECK: insertps  $48
-; CHECK: ret
+; X32-LABEL: pr20087:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    insertps $-78, 8(%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: pr20087:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps $-78, 8(%rdi), %xmm0
+; X64-NEXT:    retq
   %load = load <4 x float> *%ptr
   %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
   ret <4 x float> %ret
 }
+
+; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
+define void @insertps_pr20411(i32* noalias nocapture %RET) #1 {
+; X32-LABEL: insertps_pr20411:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; X32-NEXT:    insertps $-36, LCPI49_1+12, %xmm0
+; X32-NEXT:    movups %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_pr20411:
+; X64:       ## BB#0:
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; X64-NEXT:    insertps $-36, LCPI49_1+{{.*}}(%rip), %xmm0
+; X64-NEXT:    movups %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %gather_load = shufflevector <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %shuffle109 = shufflevector <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; 4 5 6 7
+  %shuffle116 = shufflevector <8 x i32> %gather_load, <8 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> ; 3 x x x
+  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 4, i32 3, i32 undef, i32 undef> ; 3 7 x x
+  %ptrcast = bitcast i32* %RET to <4 x i32>*
+  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
+  ret void
+}
+
+define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_4:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_4:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
+; X64-NEXT:    retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
+  %vecext2 = extractelement <4 x float> %B, i32 2
+  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_5:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_5:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
+; X64-NEXT:    retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecext1 = extractelement <4 x float> %B, i32 1
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_6:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_6:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
+; X64-NEXT:    retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 1
+  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
+  %vecext1 = extractelement <4 x float> %B, i32 2
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
+  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit3
+}
+
+define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_7:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_7:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
+; X64-NEXT:    retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
+  %vecext2 = extractelement <4 x float> %B, i32 1
+  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_8:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_8:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; X64-NEXT:    retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecext1 = extractelement <4 x float> %B, i32 0
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_9:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_9:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 0
+  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
+  %vecext1 = extractelement <4 x float> %B, i32 2
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
+  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit3
+}
diff --git a/test/CodeGen/X86/stack-protector-dbginfo.ll b/test/CodeGen/X86/stack-protector-dbginfo.ll
index cf88ade..cf0f999 100644
--- a/test/CodeGen/X86/stack-protector-dbginfo.ll
+++ b/test/CodeGen/X86/stack-protector-dbginfo.ll
@@ -10,88 +10,88 @@
 ; Function Attrs: nounwind sspreq
 define i32 @_Z18read_response_sizev() #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !23), !dbg !39
+  tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !39
   %0 = load i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0), align 8, !dbg !40
-  tail call void @llvm.dbg.value(metadata !63, i64 0, metadata !64), !dbg !71
+  tail call void @llvm.dbg.value(metadata !63, i64 0, metadata !64, metadata !{metadata !"0x102"}), !dbg !71
   %1 = trunc i64 %0 to i32
   ret i32 %1
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata)
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 
 attributes #0 = { sspreq }
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21, !72}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 true, metadata !"", i32 0, metadata !2, metadata !5, metadata !8, metadata !20, metadata !5, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/matt/ryan_bug/<unknown>] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \001\00\000\00\001", metadata !1, metadata !2, metadata !5, metadata !8, metadata !20, metadata !5} ; [ DW_TAG_compile_unit ] [/Users/matt/ryan_bug/<unknown>] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"<unknown>", metadata !"/Users/matt/ryan_bug"}
 !2 = metadata !{metadata !3}
-!3 = metadata !{i32 786436, metadata !1, metadata !4, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"C", i32 19, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 19, size 8, align 8, offset 0] [def] [from ]
+!3 = metadata !{metadata !"0x4\00\0020\0032\0032\000\000\000", metadata !1, metadata !4, null, metadata !6, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00C\0019\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 19, size 8, align 8, offset 0] [def] [from ]
 !5 = metadata !{}
 !6 = metadata !{metadata !7}
-!7 = metadata !{i32 786472, metadata !"max_frame_size", i64 0} ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
+!7 = metadata !{metadata !"0x28\00max_frame_size\000"} ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
 !8 = metadata !{metadata !9, metadata !24, metadata !41, metadata !65}
-!9 = metadata !{i32 786478, metadata !1, metadata !10, metadata !"read_response_size", metadata !"read_response_size", metadata !"_Z18read_response_sizev", i32 27, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @_Z18read_response_sizev, null, null, metadata !14, i32 27} ; [ DW_TAG_subprogram ] [line 27] [def] [read_response_size]
-!10 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/Users/matt/ryan_bug/<unknown>]
-!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x2e\00read_response_size\00read_response_size\00_Z18read_response_sizev\0027\000\001\000\006\00256\001\0027", metadata !1, metadata !10, metadata !11, null, i32 ()* @_Z18read_response_sizev, null, null, metadata !14} ; [ DW_TAG_subprogram ] [line 27] [def] [read_response_size]
+!10 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/Users/matt/ryan_bug/<unknown>]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !14 = metadata !{metadata !15, metadata !19}
-!15 = metadata !{i32 786688, metadata !9, metadata !"b", metadata !10, i32 28, metadata !16, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [b] [line 28]
-!16 = metadata !{i32 786451, metadata !1, null, metadata !"B", i32 16, i64 32, i64 32, i32 0, i32 0, null, metadata !17, i32 0, null, null} ; [ DW_TAG_structure_type ] [B] [line 16, size 32, align 32, offset 0] [def] [from ]
+!15 = metadata !{metadata !"0x100\00b\0028\000", metadata !9, metadata !10, metadata !16} ; [ DW_TAG_auto_variable ] [b] [line 28]
+!16 = metadata !{metadata !"0x13\00B\0016\0032\0032\000\000\000", metadata !1, null, null, metadata !17, null, null} ; [ DW_TAG_structure_type ] [B] [line 16, size 32, align 32, offset 0] [def] [from ]
 !17 = metadata !{metadata !18}
-!18 = metadata !{i32 786445, metadata !1, metadata !16, metadata !"end_of_file", i32 17, i64 32, i64 32, i64 0, i32 0, metadata !13} ; [ DW_TAG_member ] [end_of_file] [line 17, size 32, align 32, offset 0] [from int]
-!19 = metadata !{i32 786688, metadata !9, metadata !"c", metadata !10, i32 29, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [c] [line 29]
+!18 = metadata !{metadata !"0xd\00end_of_file\0017\0032\0032\000\000", metadata !1, metadata !16, metadata !13} ; [ DW_TAG_member ] [end_of_file] [line 17, size 32, align 32, offset 0] [from int]
+!19 = metadata !{metadata !"0x100\00c\0029\000", metadata !9, metadata !10, metadata !13} ; [ DW_TAG_auto_variable ] [c] [line 29]
 !20 = metadata !{}
 !21 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
 !22 = metadata !{i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0)}
-!23 = metadata !{i32 786689, metadata !24, metadata !"p2", metadata !10, i32 33554444, metadata !32, i32 0, metadata !38} ; [ DW_TAG_arg_variable ] [p2] [line 12]
-!24 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"min<unsigned long long>", metadata !"min<unsigned long long>", metadata !"_ZN3__13minIyEERKT_S3_RS1_", i32 12, metadata !27, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, metadata !33, null, metadata !35, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [min<unsigned long long>]
-!25 = metadata !{i32 786489, metadata !26, null, metadata !"__1", i32 1} ; [ DW_TAG_namespace ] [__1] [line 1]
+!23 = metadata !{metadata !"0x101\00p2\0033554444\000", metadata !24, metadata !10, metadata !32, metadata !38} ; [ DW_TAG_arg_variable ] [p2] [line 12]
+!24 = metadata !{metadata !"0x2e\00min<unsigned long long>\00min<unsigned long long>\00_ZN3__13minIyEERKT_S3_RS1_\0012\000\001\000\006\00256\001\0012", metadata !1, metadata !25, metadata !27, null, null, metadata !33, null, metadata !35} ; [ DW_TAG_subprogram ] [line 12] [def] [min<unsigned long long>]
+!25 = metadata !{metadata !"0x39\00__1\001", metadata !26, null} ; [ DW_TAG_namespace ] [__1] [line 1]
 !26 = metadata !{metadata !"main.cpp", metadata !"/Users/matt/ryan_bug"}
-!27 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !28, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!27 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !28, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !28 = metadata !{metadata !29, metadata !29, metadata !32}
-!29 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !30} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!30 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !31} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
-!31 = metadata !{i32 786468, null, null, metadata !"long long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!32 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !31} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
+!29 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !30} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!30 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !31} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
+!31 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!32 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !31} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
 !33 = metadata !{metadata !34}
-!34 = metadata !{i32 786479, null, metadata !"_Tp", metadata !31, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!34 = metadata !{metadata !"0x2f\00_Tp\000\000", null, metadata !31, null} ; [ DW_TAG_template_type_parameter ]
 !35 = metadata !{metadata !36, metadata !37}
-!36 = metadata !{i32 786689, metadata !24, metadata !"p1", metadata !10, i32 16777228, metadata !29, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 12]
-!37 = metadata !{i32 786689, metadata !24, metadata !"p2", metadata !10, i32 33554444, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p2] [line 12]
+!36 = metadata !{metadata !"0x101\00p1\0016777228\000", metadata !24, metadata !10, metadata !29} ; [ DW_TAG_arg_variable ] [p1] [line 12]
+!37 = metadata !{metadata !"0x101\00p2\0033554444\000", metadata !24, metadata !10, metadata !32} ; [ DW_TAG_arg_variable ] [p2] [line 12]
 !38 = metadata !{i32 33, i32 0, metadata !9, null}
 !39 = metadata !{i32 12, i32 0, metadata !24, metadata !38}
 !40 = metadata !{i32 9, i32 0, metadata !41, metadata !59}
-!41 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"min<unsigned long long, __1::A>", metadata !"min<unsigned long long, __1::A>", metadata !"_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_", i32 7, metadata !42, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, metadata !53, null, metadata !55, i32 8} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 8] [min<unsigned long long, __1::A>]
-!42 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !43, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!41 = metadata !{metadata !"0x2e\00min<unsigned long long, __1::A>\00min<unsigned long long, __1::A>\00_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_\007\000\001\000\006\00256\001\008", metadata !1, metadata !25, metadata !42, null, null, metadata !53, null, metadata !55} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 8] [min<unsigned long long, __1::A>]
+!42 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !43, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !43 = metadata !{metadata !29, metadata !29, metadata !32, metadata !44}
-!44 = metadata !{i32 786451, metadata !1, metadata !25, metadata !"A", i32 0, i64 8, i64 8, i32 0, i32 0, null, metadata !45, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 0, size 8, align 8, offset 0] [def] [from ]
+!44 = metadata !{metadata !"0x13\00A\000\008\008\000\000\000", metadata !1, metadata !25, null, metadata !45, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 0, size 8, align 8, offset 0] [def] [from ]
 !45 = metadata !{metadata !46}
-!46 = metadata !{i32 786478, metadata !1, metadata !44, metadata !"operator()", metadata !"operator()", metadata !"_ZN3__11AclERKiS2_", i32 1, metadata !47, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !52, i32 1} ; [ DW_TAG_subprogram ] [line 1] [operator()]
-!47 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !48, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!46 = metadata !{metadata !"0x2e\00operator()\00operator()\00_ZN3__11AclERKiS2_\001\000\000\000\006\00256\001\001", metadata !1, metadata !44, metadata !47, null, null, null, i32 0, metadata !52} ; [ DW_TAG_subprogram ] [line 1] [operator()]
+!47 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !48, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !48 = metadata !{metadata !13, metadata !49, metadata !50, metadata !50}
-!49 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
-!50 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !51} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!51 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
+!49 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
+!50 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !51} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!51 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !13} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
 !52 = metadata !{i32 786468}
 !53 = metadata !{metadata !34, metadata !54}
-!54 = metadata !{i32 786479, null, metadata !"_Compare", metadata !44, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!54 = metadata !{metadata !"0x2f\00_Compare\000\000", null, metadata !44, null} ; [ DW_TAG_template_type_parameter ]
 !55 = metadata !{metadata !56, metadata !57, metadata !58}
-!56 = metadata !{i32 786689, metadata !41, metadata !"p1", metadata !10, i32 16777223, metadata !29, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 7]
-!57 = metadata !{i32 786689, metadata !41, metadata !"p2", metadata !10, i32 33554439, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p2] [line 7]
-!58 = metadata !{i32 786689, metadata !41, metadata !"p3", metadata !10, i32 50331656, metadata !44, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p3] [line 8]
+!56 = metadata !{metadata !"0x101\00p1\0016777223\000", metadata !41, metadata !10, metadata !29} ; [ DW_TAG_arg_variable ] [p1] [line 7]
+!57 = metadata !{metadata !"0x101\00p2\0033554439\000", metadata !41, metadata !10, metadata !32} ; [ DW_TAG_arg_variable ] [p2] [line 7]
+!58 = metadata !{metadata !"0x101\00p3\0050331656\000", metadata !41, metadata !10, metadata !44} ; [ DW_TAG_arg_variable ] [p3] [line 8]
 !59 = metadata !{i32 13, i32 0, metadata !24, metadata !38}
 !63 = metadata !{i32 undef}
-!64 = metadata !{i32 786689, metadata !65, metadata !"p1", metadata !10, i32 33554433, metadata !50, i32 0, metadata !40} ; [ DW_TAG_arg_variable ] [p1] [line 1]
-!65 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"operator()", metadata !"operator()", metadata !"_ZN3__11AclERKiS2_", i32 1, metadata !47, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !46, metadata !66, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [operator()]
+!64 = metadata !{metadata !"0x101\00p1\0033554433\000", metadata !65, metadata !10, metadata !50, metadata !40} ; [ DW_TAG_arg_variable ] [p1] [line 1]
+!65 = metadata !{metadata !"0x2e\00operator()\00operator()\00_ZN3__11AclERKiS2_\001\000\001\000\006\00256\001\002", metadata !1, metadata !25, metadata !47, null, null, null, metadata !46, metadata !66} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [operator()]
 !66 = metadata !{metadata !67, metadata !69, metadata !70}
-!67 = metadata !{i32 786689, metadata !65, metadata !"this", null, i32 16777216, metadata !68, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!68 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
-!69 = metadata !{i32 786689, metadata !65, metadata !"p1", metadata !10, i32 33554433, metadata !50, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 1]
-!70 = metadata !{i32 786689, metadata !65, metadata !"", metadata !10, i32 50331650, metadata !50, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 2]
+!67 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !65, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!68 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!69 = metadata !{metadata !"0x101\00p1\0033554433\000", metadata !65, metadata !10, metadata !50} ; [ DW_TAG_arg_variable ] [p1] [line 1]
+!70 = metadata !{metadata !"0x101\00\0050331650\000", metadata !65, metadata !10, metadata !50} ; [ DW_TAG_arg_variable ] [line 2]
 !71 = metadata !{i32 1, i32 0, metadata !65, metadata !40}
-!72 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!72 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/stack_guard_remat.ll b/test/CodeGen/X86/stack_guard_remat.ll
new file mode 100644
index 0000000..dd639a7
--- /dev/null
+++ b/test/CodeGen/X86/stack_guard_remat.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s -check-prefix=CHECK
+
+;CHECK:  foo2
+;CHECK:  movq ___stack_chk_guard@GOTPCREL(%rip), [[R0:%[a-z0-9]+]]
+;CHECK:  movq ([[R0]]), {{%[a-z0-9]+}}
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @test_stack_guard_remat() #0 {
+entry:
+  %a1 = alloca [256 x i32], align 16
+  %0 = bitcast [256 x i32]* %a1 to i8*
+  call void @llvm.lifetime.start(i64 1024, i8* %0)
+  %arraydecay = getelementptr inbounds [256 x i32]* %a1, i64 0, i64 0
+  call void @foo3(i32* %arraydecay)
+  call void asm sideeffect "foo2", "~{r12},~{r13},~{r14},~{r15},~{ebx},~{esi},~{edi},~{dirflag},~{fpsr},~{flags}"()
+  call void @llvm.lifetime.end(i64 1024, i8* %0)
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+declare void @foo3(i32*)
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/X86/stackmap-fast-isel.ll b/test/CodeGen/X86/stackmap-fast-isel.ll
index 0b7e6db..dfb16ad 100644
--- a/test/CodeGen/X86/stackmap-fast-isel.ll
+++ b/test/CodeGen/X86/stackmap-fast-isel.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim                             | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim -fast-isel -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7                             | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort | FileCheck %s
 
 ; CHECK-LABEL:  .section  __LLVM_STACKMAPS,__llvm_stackmaps
 ; CHECK-NEXT:  __LLVM_StackMaps:
diff --git a/test/CodeGen/X86/stackmap-large-constants.ll b/test/CodeGen/X86/stackmap-large-constants.ll
new file mode 100644
index 0000000..73ee4f3
--- /dev/null
+++ b/test/CodeGen/X86/stackmap-large-constants.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+
+; CHECK-LABEL:	.section	__LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; version
+; CHECK-NEXT: 	.byte	1
+; reserved
+; CHECK-NEXT: 	.byte	0
+; reserved
+; CHECK-NEXT: 	.short	0
+; # functions
+; CHECK-NEXT: 	.long	2
+; # constants
+; CHECK-NEXT: 	.long	2
+; # records
+; CHECK-NEXT: 	.long	2
+; function address & stack size
+; CHECK-NEXT: 	.quad	_foo
+; CHECK-NEXT: 	.quad	8
+; function address & stack size
+; CHECK-NEXT: 	.quad	_bar
+; CHECK-NEXT: 	.quad	8
+
+; Constants Array:
+; CHECK-NEXT: 	.quad	9223372036854775807
+; CHECK-NEXT: 	.quad	-9223372036854775808
+
+; Patchpoint ID
+; CHECK-NEXT: 	.quad	0
+; Instruction offset
+; CHECK-NEXT: 	.long	L{{.*}}-_foo
+; reserved
+; CHECK-NEXT: 	.short	0
+; # locations
+; CHECK-NEXT: 	.short	1
+; ConstantIndex
+; CHECK-NEXT: 	.byte	5
+; reserved
+; CHECK-NEXT: 	.byte	8
+; Dwarf RegNum
+; CHECK-NEXT: 	.short	0
+; Offset
+; CHECK-NEXT: 	.long	0
+; padding
+; CHECK-NEXT: 	.short	0
+; NumLiveOuts
+; CHECK-NEXT: 	.short	0
+
+; CHECK-NEXT: 	.align	3
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+
+define void @foo() {
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0, i64 9223372036854775807)
+  ret void
+}
+
+; Patchpoint ID
+; CHECK-NEXT: 	.quad	0
+; Instruction Offset
+; CHECK-NEXT: 	.long	L{{.*}}-_bar
+; reserved
+; CHECK-NEXT: 	.short	0
+; # locations
+; CHECK-NEXT: 	.short	1
+; ConstantIndex
+; CHECK-NEXT: 	.byte	5
+; reserved
+; CHECK-NEXT: 	.byte	8
+; Dwarf RegNum
+; CHECK-NEXT: 	.short	0
+; Offset
+; CHECK-NEXT: 	.long	1
+; padding
+; CHECK-NEXT: 	.short	0
+; NumLiveOuts
+; CHECK-NEXT: 	.short	0
+
+
+define void @bar() {
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0, i64 -9223372036854775808)
+  ret void
+}
diff --git a/test/CodeGen/X86/stackmap-liveness.ll b/test/CodeGen/X86/stackmap-liveness.ll
index 897595d..31553c0 100644
--- a/test/CodeGen/X86/stackmap-liveness.ll
+++ b/test/CodeGen/X86/stackmap-liveness.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim -enable-patchpoint-liveness=false | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim                                   | FileCheck -check-prefix=PATCH %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -enable-patchpoint-liveness=false | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx                                   | FileCheck -check-prefix=PATCH %s
 ;
 ; Note: Print verbose stackmaps using -debug-only=stackmaps.
 
diff --git a/test/CodeGen/X86/stackmap-nops.ll b/test/CodeGen/X86/stackmap-nops.ll
index 5a78f24..7932c0d 100644
--- a/test/CodeGen/X86/stackmap-nops.ll
+++ b/test/CodeGen/X86/stackmap-nops.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
 
 define void @nop_test() {
 entry:
@@ -224,6 +224,10 @@ entry:
   tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 28, i32 28)
   tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 29, i32 29)
   tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 30, i32 30)
+; Add an extra stackmap with a zero-length shadow to thwart the shadow
+; optimization. This will force all 15 bytes of the previous shadow to be
+; padded with nops.
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 31, i32 0)
   ret void
 }
 
diff --git a/test/CodeGen/X86/stackmap-shadow-optimization.ll b/test/CodeGen/X86/stackmap-shadow-optimization.ll
new file mode 100644
index 0000000..a3725f2
--- /dev/null
+++ b/test/CodeGen/X86/stackmap-shadow-optimization.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
+
+; Check that the X86 stackmap shadow optimization is only outputting a 3-byte
+; nop here. 8-bytes are requested, but 5 are covered by the code for the call to
+; bar.  However, the frame teardown and the return do not count towards the
+; stackmap shadow as the call return counts as a branch target so must flush
+; the shadow.
+; Note that in order for a thread to not return in to the patched space
+; the call must be at the end of the shadow, so the required nop must be
+; before the call, not after.
+define void @shadow_optimization_test() {
+entry:
+; CHECK-LABEL:  shadow_optimization_test:
+; CHECK:        callq   _bar
+; CHECK:        nop
+; CHECK:        callq   _bar
+; CHECK-NOT:    nop
+; CHECK:        callq   _bar
+; CHECK-NOT:    nop
+  call void @bar()
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 8)
+  call void @bar()
+  call void @bar()
+  ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @bar()
diff --git a/test/CodeGen/X86/stackmap.ll b/test/CodeGen/X86/stackmap.ll
index 8567037..5e356f3 100644
--- a/test/CodeGen/X86/stackmap.ll
+++ b/test/CodeGen/X86/stackmap.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
 ;
 ; Note: Print verbose stackmaps using -debug-only=stackmaps.
 
@@ -9,11 +9,11 @@
 ; CHECK-NEXT:   .byte 0
 ; CHECK-NEXT:   .short 0
 ; Num Functions
-; CHECK-NEXT:   .long 15
+; CHECK-NEXT:   .long 16
 ; Num LargeConstants
 ; CHECK-NEXT:   .long 3
 ; Num Callsites
-; CHECK-NEXT:   .long 19
+; CHECK-NEXT:   .long 20
 
 ; Functions and stack size
 ; CHECK-NEXT:   .quad _constantargs
@@ -46,6 +46,8 @@
 ; CHECK-NEXT:   .quad 8
 ; CHECK-NEXT:   .quad _clobberScratch
 ; CHECK-NEXT:   .quad 56
+; CHECK-NEXT:   .quad _needsStackRealignment
+; CHECK-NEXT:   .quad -1
 
 ; Large Constants
 ; CHECK-NEXT:   .quad   2147483648
@@ -464,6 +466,23 @@ define void @clobberScratch(i32 %a) {
   ret void
 }
 
+; A stack frame which needs to be realigned at runtime (to meet alignment 
+; criteria for values on the stack) does not have a fixed frame size. 
+; CHECK-LABEL:  .long L{{.*}}-_needsStackRealignment
+; CHECK-NEXT:   .short 0
+; 0 locations
+; CHECK-NEXT:   .short 0
+define void @needsStackRealignment() {
+  %val = alloca i64, i32 3, align 128
+  tail call void (...)* @escape_values(i64* %val)
+; Note: Adding any non-constant to the stackmap would fail because we
+; expected to be able to address off the frame pointer.  In a realigned
+; frame, we must use the stack pointer instead.  This is a separate bug.
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0)
+  ret void
+}
+declare void @escape_values(...)
+
 declare void @llvm.experimental.stackmap(i64, i32, ...)
 declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
 declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/store-narrow.ll b/test/CodeGen/X86/store-narrow.ll
index 7557f25..e3cc2fa 100644
--- a/test/CodeGen/X86/store-narrow.ll
+++ b/test/CodeGen/X86/store-narrow.ll
@@ -34,8 +34,8 @@ entry:
 ; X64: movb	%sil, 1(%rdi)
 
 ; X32-LABEL: test2:
-; X32: movb	8(%esp), %[[REG:[abcd]l]]
-; X32: movb	%[[REG]], 1(%{{.*}})
+; X32: movb	8(%esp), %[[REG:[abcd]]]l
+; X32: movb	%[[REG]]l, 1(%{{.*}})
 }
 
 define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
@@ -67,8 +67,8 @@ entry:
 ; X64: movw	%si, 2(%rdi)
 
 ; X32-LABEL: test4:
-; X32: movl	8(%esp), %e[[REG:[abcd]x]]
-; X32: movw	%[[REG]], 2(%{{.*}})
+; X32: movw	8(%esp), %[[REG:[abcd]]]x
+; X32: movw	%[[REG]]x, 2(%{{.*}})
 }
 
 define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp {
@@ -84,8 +84,8 @@ entry:
 ; X64: movw	%si, 2(%rdi)
 
 ; X32-LABEL: test5:
-; X32: movzwl	8(%esp), %e[[REG:[abcd]x]]
-; X32: movw	%[[REG]], 2(%{{.*}})
+; X32: movw	8(%esp), %[[REG:[abcd]]]x
+; X32: movw	%[[REG]]x, 2(%{{.*}})
 }
 
 define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp {
diff --git a/test/CodeGen/X86/swizzle-2.ll b/test/CodeGen/X86/swizzle-2.ll
index 4b1f903..697af84 100644
--- a/test/CodeGen/X86/swizzle-2.ll
+++ b/test/CodeGen/X86/swizzle-2.ll
@@ -8,508 +8,433 @@
 ; illegal shuffle that is expanded into a sub-optimal sequence of instructions
 ; during lowering stage.
 
-
 define <4 x i32> @swizzle_1(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_1
-; Mask: [1,0,3,2]
-; CHECK: pshufd $-79
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_2(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_2
-; Mask: [2,1,3,0]
-; CHECK: pshufd $54
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_3(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_3
-; Mask: [1,0,3,2]
-; CHECK: pshufd $-79
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_4(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_4
-; Mask: [3,1,0,2]
-; CHECK: pshufd $-121
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_5(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_5
-; Mask: [2,3,0,1]
-; CHECK: pshufd $78
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_6(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_6
-; Mask: [2,0,1,3]
-; CHECK: pshufd $-46
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_7(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_7
-; Mask: [0,2,3,1]
-; CHECK: pshufd $120
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_8(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_8
-; Mask: [1,3,2,0]
-; CHECK: pshufd $45
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_9(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_9:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_9
-; Mask: [2,3,0,1]
-; CHECK: pshufd $78
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_10(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_10:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,2,0,3]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_10
-; Mask: [1,2,0,3]
-; CHECK: pshufd $-55
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_11(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_11:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_11
-; Mask: [3,2,1,0]
-; CHECK: pshufd $27
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_12(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_12:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_12
-; Mask: [0,3,1,2]
-; CHECK: pshufd $-100
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_13(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_13:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_13
-; Mask: [3,2,1,0]
-; CHECK: pshufd $27
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_14(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_14:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,2,1]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_14
-; Mask: [3,0,2,1]
-; CHECK: pshufd $99
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_15(<4 x float> %v) {
+; CHECK-LABEL: swizzle_15:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_15
-; Mask: [1,0,3,2]
-; CHECK: pshufd $-79
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_16(<4 x float> %v) {
+; CHECK-LABEL: swizzle_16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_16
-; Mask: [2,1,3,0]
-; CHECK: pshufd $54
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_17(<4 x float> %v) {
+; CHECK-LABEL: swizzle_17:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_17
-; Mask: [1,0,3,2]
-; CHECK: pshufd $-79
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_18(<4 x float> %v) {
+; CHECK-LABEL: swizzle_18:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,0,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_18
-; Mask: [3,1,0,2]
-; CHECK: pshufd $-121
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_19(<4 x float> %v) {
+; CHECK-LABEL: swizzle_19:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_19
-; Mask: [2,3,0,1]
-; CHECK: pshufd $78
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_20(<4 x float> %v) {
+; CHECK-LABEL: swizzle_20:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_20
-; Mask: [2,0,1,3]
-; CHECK: pshufd $-46
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_21(<4 x float> %v) {
+; CHECK-LABEL: swizzle_21:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_21
-; Mask: [0,2,3,1]
-; CHECK: pshufd $120
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_22(<4 x float> %v) {
+; CHECK-LABEL: swizzle_22:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_22
-; Mask: [1,3,2,0]
-; CHECK: pshufd $45
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_23(<4 x float> %v) {
+; CHECK-LABEL: swizzle_23:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_23
-; Mask: [2,3,0,1]
-; CHECK: pshufd $78
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_24(<4 x float> %v) {
+; CHECK-LABEL: swizzle_24:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2,0,3]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_24
-; Mask: [1,2,0,3]
-; CHECK: pshufd $-55
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_25(<4 x float> %v) {
+; CHECK-LABEL: swizzle_25:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_25
-; Mask: [3,2,1,0]
-; CHECK: pshufd $27
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_26(<4 x float> %v) {
+; CHECK-LABEL: swizzle_26:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,1,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_26
-; Mask: [0,3,1,2]
-; CHECK: pshufd $-100
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_27(<4 x float> %v) {
+; CHECK-LABEL: swizzle_27:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_27
-; Mask: [3,2,1,0]
-; CHECK: pshufd $27
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_28(<4 x float> %v) {
+; CHECK-LABEL: swizzle_28:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,2,1]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_28
-; Mask: [3,0,2,1]
-; CHECK: pshufd $99
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_29(<4 x float> %v) {
+; CHECK-LABEL: swizzle_29:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_29
-; Mask: [1,3,2,0]
-; CHECK: pshufd $45
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
 
 ; Make sure that we combine the shuffles from each function below into a single
 ; legal shuffle (either pshuflw or pshufb depending on the masks).
 
 define <8 x i16> @swizzle_30(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_30:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 7, i32 5, i32 6, i32 4>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_30
-; Mask: [1,3,2,0,5,7,6,4]
-; CHECK: pshuflw $45
-; CHECK-NOT: pshufb
-; CHECK-NEXT: ret
-
 
 define <8 x i16> @swizzle_31(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_31:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 5, i32 6, i32 4>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 5, i32 6, i32 4>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_31
-; Mask: [1,3,2,0,4,5,6,7]
-; CHECK: pshuflw $45
-; CHECK-NOT: pshufb
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_32(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 7, i32 5, i32 6, i32 4>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 7, i32 5, i32 6, i32 4>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_32
-; Mask: [2,3,0,1,4,5,6,7] --> equivalent to pshufd mask [1,0,2,3]
-; CHECK: pshufd $-31
-; CHECK-NOT: pshufb
-; CHECK: ret
 
 define <8 x i16> @swizzle_33(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_33:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7]
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 5, i32 7, i32 2, i32 3, i32 1, i32 0>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 5, i32 7, i32 2, i32 3, i32 1, i32 0>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_33
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_34(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_34:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,0,2,4,5,6,7]
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 7, i32 6, i32 5, i32 1, i32 2, i32 0, i32 3>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 7, i32 6, i32 5, i32 1, i32 2, i32 0, i32 3>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_34
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_35(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_35:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 1, i32 3, i32 0, i32 2>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 1, i32 3, i32 0, i32 2>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_35
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_36(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_36:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 7, i32 5, i32 0, i32 1, i32 3, i32 2>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 7, i32 5, i32 0, i32 1, i32 3, i32 2>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_36
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_37(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_37:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 7, i32 5, i32 6, i32 4>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 7, i32 4, i32 6, i32 5>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_37
-; Mask: [0,1,2,3,4,7,6,5]
-; CHECK: pshufhw $108
-; CHECK-NOT: pshufb
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_38(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_38:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 5, i32 6, i32 4, i32 7, i32 0, i32 2, i32 1, i32 3>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 5, i32 6, i32 4, i32 7, i32 0, i32 2, i32 1, i32 3>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_38
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_39(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_39:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,3,1,0,4,5,6,7]
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 3, i32 2, i32 1, i32 0>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 3, i32 2, i32 1, i32 0>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_39
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_40(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_40:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 6, i32 4, i32 7, i32 5, i32 1, i32 0, i32 3, i32 2>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 6, i32 4, i32 7, i32 5, i32 1, i32 0, i32 3, i32 2>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_40
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_41(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_41:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 5, i32 4, i32 0, i32 1, i32 3, i32 2>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 5, i32 4, i32 0, i32 1, i32 3, i32 2>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_41
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_42(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_42:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 7, i32 6, i32 4, i32 5>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 7, i32 6, i32 4, i32 5>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_42
-; Mask: [0,1,2,3,5,4,7,6]
-; CHECK: pshufhw $-79
-; CHECK-NOT: pshufb
-; CHECK: ret
-
-
diff --git a/test/CodeGen/X86/swizzle.ll b/test/CodeGen/X86/swizzle.ll
deleted file mode 100644
index 23e0c24..0000000
--- a/test/CodeGen/X86/swizzle.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movlps
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movsd
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep movups
-; rdar://6523650
-
-	%struct.vector4_t = type { <4 x float> }
-
-define void @swizzle(i8* nocapture %a, %struct.vector4_t* nocapture %b, %struct.vector4_t* nocapture %c) nounwind {
-entry:
-	%0 = getelementptr %struct.vector4_t* %b, i32 0, i32 0		; <<4 x float>*> [#uses=2]
-	%1 = load <4 x float>* %0, align 4		; <<4 x float>> [#uses=1]
-	%tmp.i = bitcast i8* %a to double*		; <double*> [#uses=1]
-	%tmp1.i = load double* %tmp.i		; <double> [#uses=1]
-	%2 = insertelement <2 x double> undef, double %tmp1.i, i32 0		; <<2 x double>> [#uses=1]
-	%tmp2.i = bitcast <2 x double> %2 to <4 x float>		; <<4 x float>> [#uses=1]
-	%3 = shufflevector <4 x float> %1, <4 x float> %tmp2.i, <4 x i32> < i32 4, i32 5, i32 2, i32 3 >		; <<4 x float>> [#uses=1]
-	store <4 x float> %3, <4 x float>* %0, align 4
-	ret void
-}
diff --git a/test/CodeGen/X86/tailcall-multiret.ll b/test/CodeGen/X86/tailcall-multiret.ll
new file mode 100644
index 0000000..a77a59c
--- /dev/null
+++ b/test/CodeGen/X86/tailcall-multiret.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mcpu=core2 | FileCheck %s
+; See PR19530
+declare double    @llvm.powi.f64(double %Val, i32 %power)
+define <3 x double> @julia_foo17589(i32 %arg) {
+  %tmp1 = call double @llvm.powi.f64(double 1.000000e+00, i32 %arg)
+; CHECK: callq   __powidf2
+  %tmp2 = insertelement <3 x double> undef, double %tmp1, i32 0
+  %tmp3 = call double @llvm.powi.f64(double 2.000000e+00, i32 %arg)
+; CHECK: callq   __powidf2
+  %tmp4 = insertelement <3 x double> %tmp2, double %tmp3, i32 1
+  %tmp5 = call double @llvm.powi.f64(double 3.000000e+00, i32 %arg)
+; CHECK: callq   __powidf2
+  %tmp6 = insertelement <3 x double> %tmp4, double %tmp5, i32 2
+; CHECK-NOT: TAILCALL
+  ret <3 x double> %tmp6
+}
diff --git a/test/CodeGen/X86/tls-addr-non-leaf-function.ll b/test/CodeGen/X86/tls-addr-non-leaf-function.ll
new file mode 100644
index 0000000..ec47232
--- /dev/null
+++ b/test/CodeGen/X86/tls-addr-non-leaf-function.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -relocation-model=pic -O2 -disable-fp-elim -o - | FileCheck %s
+; RUN: llc < %s -relocation-model=pic -O2 -o - | FileCheck %s
+
+; This test runs twice with different options regarding the frame pointer:
+; first the elimination is disabled, then it is enabled. The disabled case is
+; the "control group".
+; The function 'foo' below is marked with the "no-frame-pointer-elim-non-leaf"
+; attribute which dictates that the frame pointer should not be eliminated
+; unless the function is a leaf (i.e. it doesn't call any other function).
+; Now, 'foo' is not a leaf function, because it performs a TLS access which on
+; X86 ELF in PIC mode is expanded as a library call.
+; This call is represented with a pseudo-instruction which doesn't appear to be
+; a call when inspected by the analysis passes (it doesn't have the "isCall"
+; flag), and the ISel lowering code creating the pseudo was not informing the 
+; MachineFrameInfo that the function contained calls. This affected the decision
+; whether to eliminate the frame pointer.
+; With the fix, the "hasCalls" flag is set in the MFI for the function whenever
+; a TLS access pseudo-instruction is created, so 'foo' appears to be a non-leaf
+; function, and the difference in the options does not affect codegen: both
+; versions will have a frame pointer.
+
+; Test that there's some frame pointer usage in 'foo'...
+; CHECK: foo:
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; ... and the TLS library call is also present.
+; CHECK: leaq x@TLSGD(%rip), %rdi
+; CHECK: callq __tls_get_addr@PLT
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@x = thread_local global i32 0
+define i32 @foo() "no-frame-pointer-elim-non-leaf" {
+  %a = load i32* @x, align 4
+  ret i32 %a
+}
diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll
index d230f1f..8de6297 100644
--- a/test/CodeGen/X86/trunc-ext-ld-st.ll
+++ b/test/CodeGen/X86/trunc-ext-ld-st.ll
@@ -20,7 +20,7 @@ define void @load_2_i8(<2 x i8>* %A)  {
 ; Read 32-bits
 ;CHECK: pmovzxwq
 ;CHECK: paddq
-;CHECK: pshufb
+;CHECK: pshufd
 ;CHECK: movd
 ;CHECK: ret
 define void @load_2_i16(<2 x i16>* %A)  {
@@ -32,7 +32,7 @@ define void @load_2_i16(<2 x i16>* %A)  {
 
 ;CHECK-LABEL: load_2_i32:
 ;CHECK: pmovzxdq
-;CHECK: paddq
+;CHECK: paddd
 ;CHECK: pshufd
 ;CHECK: ret
 define void @load_2_i32(<2 x i32>* %A)  {
@@ -56,7 +56,7 @@ define void @load_4_i8(<4 x i8>* %A)  {
 
 ;CHECK-LABEL: load_4_i16:
 ;CHECK: pmovzxwd
-;CHECK: paddd
+;CHECK: paddw
 ;CHECK: pshufb
 ;CHECK: ret
 define void @load_4_i16(<4 x i16>* %A)  {
@@ -68,7 +68,7 @@ define void @load_4_i16(<4 x i16>* %A)  {
 
 ;CHECK-LABEL: load_8_i8:
 ;CHECK: pmovzxbw
-;CHECK: paddw
+;CHECK: paddb
 ;CHECK: pshufb
 ;CHECK: ret
 define void @load_8_i8(<8 x i8>* %A)  {
diff --git a/test/CodeGen/X86/uint_to_fp-2.ll b/test/CodeGen/X86/uint_to_fp-2.ll
index c5a61c3..e47f154 100644
--- a/test/CodeGen/X86/uint_to_fp-2.ll
+++ b/test/CodeGen/X86/uint_to_fp-2.ll
@@ -1,15 +1,20 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown-unknown -march=x86 -mattr=+sse2 | FileCheck %s
 
 ; rdar://6504833
 define float @test1(i32 %x) nounwind readnone {
-; CHECK: test1
-; CHECK: movd
-; CHECK: orps
-; CHECK: subsd
-; CHECK: cvtsd2ss
-; CHECK: movss
-; CHECK: flds
-; CHECK: ret
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movsd .LCPI0_0, %xmm0
+; CHECK-NEXT:    movd {{[0-9]+}}(%esp), %xmm1
+; CHECK-NEXT:    orps %xmm0, %xmm1
+; CHECK-NEXT:    subsd %xmm0, %xmm1
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    cvtsd2ss %xmm1, %xmm0
+; CHECK-NEXT:    movss %xmm0, (%esp)
+; CHECK-NEXT:    flds (%esp)
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    retl
 entry:
 	%0 = uitofp i32 %x to float
 	ret float %0
@@ -17,15 +22,20 @@ entry:
 
 ; PR10802
 define float @test2(<4 x i32> %x) nounwind readnone ssp {
-; CHECK: test2
-; CHECK: xorps [[ZERO:%xmm[0-9]+]]
-; CHECK: movss {{.*}}, [[ZERO]]
-; CHECK: orps
-; CHECK: subsd
-; CHECK: cvtsd2ss
-; CHECK: movss
-; CHECK: flds
-; CHECK: ret
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    movss %xmm0, %xmm1
+; CHECK-NEXT:    movsd .LCPI1_0, %xmm0
+; CHECK-NEXT:    orps %xmm0, %xmm1
+; CHECK-NEXT:    subsd %xmm0, %xmm1
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    cvtsd2ss %xmm1, %xmm0
+; CHECK-NEXT:    movss %xmm0, (%esp)
+; CHECK-NEXT:    flds (%esp)
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    retl
 entry:
   %vecext = extractelement <4 x i32> %x, i32 0
   %conv = uitofp i32 %vecext to float
diff --git a/test/CodeGen/X86/unknown-location.ll b/test/CodeGen/X86/unknown-location.ll
index d7ae469..ca9ea4a 100644
--- a/test/CodeGen/X86/unknown-location.ll
+++ b/test/CodeGen/X86/unknown-location.ll
@@ -21,16 +21,16 @@ entry:
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!12}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"x", metadata !2, i32 1, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !10, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 1, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, i32, i32, i32)* @foo, null, null, null, i32 1} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !10, i32 12, metadata !"producer", i1 false, metadata !"", i32 0, metadata !11, metadata !11, metadata !9, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !10, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00x\001\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\001\000\001\000\006\000\000\001", metadata !10, metadata !2, metadata !4, null, i32 (i32, i32, i32, i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !10} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\0012\00producer\000\00\000\00\000", metadata !10, metadata !11, metadata !11, metadata !9, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !10, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6}
-!6 = metadata !{i32 786468, metadata !10, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786443, metadata !2, metadata !1, i32 1, i32 30, i32 0} ; [ DW_TAG_lexical_block ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !10, metadata !2} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0xb\001\0030\000", metadata !2, metadata !1} ; [ DW_TAG_lexical_block ]
 !8 = metadata !{i32 4, i32 3, metadata !7, null}
 !9 = metadata !{metadata !1}
 !10 = metadata !{metadata !"test.c", metadata !"/dir"}
 !11 = metadata !{i32 0}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/v-binop-widen.ll b/test/CodeGen/X86/v-binop-widen.ll
deleted file mode 100644
index fca4da6..0000000
--- a/test/CodeGen/X86/v-binop-widen.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc -mcpu=generic -march=x86 -mattr=+sse < %s | FileCheck %s
-; CHECK: divps
-; CHECK: divps
-; CHECK: divss
-
-%vec = type <9 x float>
-define %vec @vecdiv( %vec %p1, %vec %p2)
-{
-  %result = fdiv %vec %p1, %p2
-  ret %vec %result
-}
diff --git a/test/CodeGen/X86/v-binop-widen2.ll b/test/CodeGen/X86/v-binop-widen2.ll
deleted file mode 100644
index 3342111..0000000
--- a/test/CodeGen/X86/v-binop-widen2.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc -march=x86 -mcpu=generic -mattr=+sse < %s | FileCheck %s
-; RUN: llc -march=x86 -mcpu=atom -mattr=+sse < %s | FileCheck -check-prefix=ATOM %s
-
-%vec = type <6 x float>
-; CHECK: divps
-; CHECK: divss
-; CHECK: divss
-
-; Scheduler causes a different instruction order to be produced on Intel Atom
-; ATOM: divps
-; ATOM: divss
-; ATOM: divss
-
-define %vec @vecdiv( %vec %p1, %vec %p2)
-{
-  %result = fdiv %vec %p1, %p2
-  ret %vec %result
-}
-
-@a = constant %vec < float 2.0, float 4.0, float 8.0, float 16.0, float 32.0, float 64.0 >
-@b = constant %vec < float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0 >
-
-; Expected result: < 1.0, 2.0, 4.0, ..., 2.0^(n-1) >
-; main() returns 0 if the result is expected and 1 otherwise
-; to execute, use llvm-as < %s | lli
-define i32 @main() nounwind {
-entry:
-  %avec = load %vec* @a
-  %bvec = load %vec* @b
-
-  %res = call %vec @vecdiv(%vec %avec, %vec %bvec)
-  br label %loop
-loop:
-  %idx = phi i32 [0, %entry], [%nextInd, %looptail]
-  %expected = phi float [1.0, %entry], [%nextExpected, %looptail]
-  %elem = extractelement %vec %res, i32 %idx
-  %expcmp = fcmp oeq float %elem, %expected
-  br i1 %expcmp, label %looptail, label %return
-looptail:
-  %nextExpected = fmul float %expected, 2.0
-  %nextInd = add i32 %idx, 1
-  %cmp = icmp slt i32 %nextInd, 6
-  br i1 %cmp, label %loop, label %return
-return:
-  %retval = phi i32 [0, %looptail], [1, %loop]
-  ret i32 %retval
-}
diff --git a/test/CodeGen/X86/v2f32.ll b/test/CodeGen/X86/v2f32.ll
index dab5e7b..b9bd80f9 100644
--- a/test/CodeGen/X86/v2f32.ll
+++ b/test/CodeGen/X86/v2f32.ll
@@ -1,115 +1,94 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn -asm-verbose=0 -o - | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=penryn -asm-verbose=0 -o - | FileCheck %s -check-prefix=W64
-; RUN: llc < %s -mcpu=yonah -march=x86 -mtriple=i386-linux-gnu -asm-verbose=0 -o - | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn -o - | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mcpu=yonah -march=x86 -mtriple=i386-linux-gnu -o - | FileCheck %s --check-prefix=X32
 
 ; PR7518
 define void @test1(<2 x float> %Q, float *%P2) nounwind {
+; X64-LABEL: test1:
+; X64:       # BB#0:
+; X64-NEXT:    movaps %xmm0, %xmm1
+; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X64-NEXT:    addss %xmm0, %xmm1
+; X64-NEXT:    movss %xmm1, (%rdi)
+; X64-NEXT:    retq
+;
+; X32-LABEL: test1:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movaps %xmm0, %xmm1
+; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X32-NEXT:    addss %xmm0, %xmm1
+; X32-NEXT:    movss %xmm1, (%eax)
+; X32-NEXT:    retl
   %a = extractelement <2 x float> %Q, i32 0
   %b = extractelement <2 x float> %Q, i32 1
   %c = fadd float %a, %b
-
   store float %c, float* %P2
   ret void
-; X64-LABEL: test1:
-; X64-NEXT: pshufd	$1, %xmm0, %xmm1
-; X64-NEXT: addss	%xmm0, %xmm1
-; X64-NEXT: movss	%xmm1, (%rdi)
-; X64-NEXT: ret
-
-; W64-LABEL: test1:
-; W64-NEXT: movdqa  (%rcx), %xmm0
-; W64-NEXT: pshufd  $1, %xmm0, %xmm1
-; W64-NEXT: addss   %xmm0, %xmm1
-; W64-NEXT: movss   %xmm1, (%rdx)
-; W64-NEXT: ret
-
-; X32-LABEL: test1:
-; X32-NEXT: movl	4(%esp), %eax
-; X32-NEXT: pshufd	$1, %xmm0, %xmm1
-; X32-NEXT: addss	%xmm0, %xmm1
-; X32-NEXT: movss	%xmm1, (%eax)
-; X32-NEXT: ret
 }
 
-
 define <2 x float> @test2(<2 x float> %Q, <2 x float> %R, <2 x float> *%P) nounwind {
-  %Z = fadd <2 x float> %Q, %R
-  ret <2 x float> %Z
-  
 ; X64-LABEL: test2:
-; X64-NEXT: addps	%xmm1, %xmm0
-; X64-NEXT: ret
-
-; W64-LABEL: test2:
-; W64-NEXT: movaps  (%rcx), %xmm0
-; W64-NEXT: addps   (%rdx), %xmm0
-; W64-NEXT: ret
-
+; X64:       # BB#0:
+; X64-NEXT:    addps %xmm1, %xmm0
+; X64-NEXT:    retq
+;
 ; X32-LABEL: test2:
-; X32:      addps	%xmm1, %xmm0
+; X32:       # BB#0:
+; X32-NEXT:    addps %xmm1, %xmm0
+; X32-NEXT:    retl
+  %Z = fadd <2 x float> %Q, %R
+  ret <2 x float> %Z
 }
 
-
 define <2 x float> @test3(<4 x float> %A) nounwind {
+; X64-LABEL: test3:
+; X64:       # BB#0:
+; X64-NEXT:    addps %xmm0, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test3:
+; X32:       # BB#0:
+; X32-NEXT:    addps %xmm0, %xmm0
+; X32-NEXT:    retl
 	%B = shufflevector <4 x float> %A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
 	%C = fadd <2 x float> %B, %B
 	ret <2 x float> %C
-; X64-LABEL: test3:
-; X64-NEXT: addps	%xmm0, %xmm0
-; X64-NEXT: ret
-
-; W64-LABEL: test3:
-; W64-NEXT: movaps  (%rcx), %xmm0
-; W64-NEXT: addps   %xmm0, %xmm0
-; W64-NEXT: ret
-
-; X32-LABEL: test3:
-; X32-NEXT: addps	%xmm0, %xmm0
-; X32-NEXT: ret
 }
 
 define <2 x float> @test4(<2 x float> %A) nounwind {
-	%C = fadd <2 x float> %A, %A
-	ret <2 x float> %C
 ; X64-LABEL: test4:
-; X64-NEXT: addps	%xmm0, %xmm0
-; X64-NEXT: ret
-
-; W64-LABEL: test4:
-; W64-NEXT: movaps  (%rcx), %xmm0
-; W64-NEXT: addps   %xmm0, %xmm0
-; W64-NEXT: ret
-
+; X64:       # BB#0:
+; X64-NEXT:    addps %xmm0, %xmm0
+; X64-NEXT:    retq
+;
 ; X32-LABEL: test4:
-; X32-NEXT: addps	%xmm0, %xmm0
-; X32-NEXT: ret
+; X32:       # BB#0:
+; X32-NEXT:    addps %xmm0, %xmm0
+; X32-NEXT:    retl
+	%C = fadd <2 x float> %A, %A
+	ret <2 x float> %C
 }
 
 define <4 x float> @test5(<4 x float> %A) nounwind {
+; X64-LABEL: test5:
+; X64:       # BB#0:
+; X64-NEXT:    addps %xmm0, %xmm0
+; X64-NEXT:    addps %xmm0, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test5:
+; X32:       # BB#0:
+; X32-NEXT:    addps %xmm0, %xmm0
+; X32-NEXT:    addps %xmm0, %xmm0
+; X32-NEXT:    retl
 	%B = shufflevector <4 x float> %A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
 	%C = fadd <2 x float> %B, %B
-        br label %BB
-        
+  br label %BB
+
 BB:
-        %D = fadd <2 x float> %C, %C
+  %D = fadd <2 x float> %C, %C
 	%E = shufflevector <2 x float> %D, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 	ret <4 x float> %E
-        
-; X64-LABEL: test5:
-; X64-NEXT: addps	%xmm0, %xmm0
-; X64-NEXT: addps	%xmm0, %xmm0
-; X64-NEXT: ret
-
-; W64-LABEL: test5:
-; W64-NEXT: movaps  (%rcx), %xmm0
-; W64-NEXT: addps   %xmm0, %xmm0
-; W64-NEXT: addps   %xmm0, %xmm0
-; W64-NEXT: ret
-
-; X32-LABEL: test5:
-; X32-NEXT: addps	%xmm0, %xmm0
-; X32-NEXT: addps	%xmm0, %xmm0
-; X32-NEXT: ret
 }
 
 
diff --git a/test/CodeGen/X86/vararg-callee-cleanup.ll b/test/CodeGen/X86/vararg-callee-cleanup.ll
new file mode 100644
index 0000000..2dcf319
--- /dev/null
+++ b/test/CodeGen/X86/vararg-callee-cleanup.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=i686-pc-windows < %s | FileCheck %s
+
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+
+declare x86_thiscallcc void @thiscall_thunk(i8* %this, ...)
+define i32 @call_varargs_thiscall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
+  call x86_thiscallcc void (i8*, ...)* @thiscall_thunk(i8* %a, i32 1, i32 2)
+  call x86_thiscallcc void (i8*, ...)* @thiscall_thunk(i8* %a, i32 1, i32 2)
+  %t1 = add i32 %b, %c
+  %r = add i32 %t1, %d
+  ret i32 %r
+}
+
+; CHECK: _call_varargs_thiscall_thunk:
+; CHECK: calll _thiscall_thunk
+; CHECK-NEXT: subl $8, %esp
+
+; We don't mangle the argument size into variadic callee cleanup functions.
+
+declare x86_stdcallcc void @stdcall_thunk(i8* %this, ...)
+define i32 @call_varargs_stdcall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
+  call x86_stdcallcc void (i8*, ...)* @stdcall_thunk(i8* %a, i32 1, i32 2)
+  call x86_stdcallcc void (i8*, ...)* @stdcall_thunk(i8* %a, i32 1, i32 2)
+  %t1 = add i32 %b, %c
+  %r = add i32 %t1, %d
+  ret i32 %r
+}
+
+; CHECK: _call_varargs_stdcall_thunk:
+; CHECK: calll _stdcall_thunk{{$}}
+; CHECK-NEXT: subl $12, %esp
+
+declare x86_fastcallcc void @fastcall_thunk(i8* %this, ...)
+define i32 @call_varargs_fastcall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
+  call x86_fastcallcc void (i8*, ...)* @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2)
+  call x86_fastcallcc void (i8*, ...)* @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2)
+  %t1 = add i32 %b, %c
+  %r = add i32 %t1, %d
+  ret i32 %r
+}
+
+; CHECK: _call_varargs_fastcall_thunk:
+; CHECK: calll @fastcall_thunk{{$}}
+; CHECK-NEXT: subl $4, %esp
+
+; If you actually return from such a thunk, it will only pop the non-variadic
+; portion of the arguments, which is different from what the callee passes.
+
+define x86_stdcallcc void @varargs_stdcall_return(i32, i32, ...) {
+  ret void
+}
+
+; CHECK: _varargs_stdcall_return:
+; CHECK: retl $8
diff --git a/test/CodeGen/X86/vararg_no_start.ll b/test/CodeGen/X86/vararg_no_start.ll
new file mode 100644
index 0000000..ab5c6fc
--- /dev/null
+++ b/test/CodeGen/X86/vararg_no_start.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s
+
+define void @foo(i8*, ...) {
+  ret void
+}
+; CHECK-LABEL: {{^_?}}foo:
+; CHECK-NOT: movq
+; CHECK: retq
diff --git a/test/CodeGen/X86/vastart-defs-eflags.ll b/test/CodeGen/X86/vastart-defs-eflags.ll
index 6017753..d0c5150 100644
--- a/test/CodeGen/X86/vastart-defs-eflags.ll
+++ b/test/CodeGen/X86/vastart-defs-eflags.ll
@@ -14,6 +14,7 @@ entry:
   br i1 %tobool, label %if.end, label %if.then
 
 if.then:                                          ; preds = %entry
+  call void @llvm.va_start(i8* null)
   br label %if.end
 
 if.end:                                           ; preds = %entry, %if.then
@@ -21,3 +22,4 @@ if.end:                                           ; preds = %entry, %if.then
   ret i32 %hasflag
 }
 
+declare void @llvm.va_start(i8*) nounwind
diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll
index 1a6c05d..8600c48 100644
--- a/test/CodeGen/X86/vec_cast2.ll
+++ b/test/CodeGen/X86/vec_cast2.ll
@@ -1,75 +1,177 @@
 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx | FileCheck %s
 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE
 
-;CHECK-LABEL: foo1_8:
-;CHECK: vcvtdq2ps
-;CHECK: ret
-;
-;CHECK-WIDE-LABEL: foo1_8:
-;CHECK-WIDE:      vpmovzxbd %xmm0, %xmm1
-;CHECK-WIDE-NEXT: vpslld $24, %xmm1, %xmm1
-;CHECK-WIDE-NEXT: vpsrad $24, %xmm1, %xmm1
-;CHECK-WIDE-NEXT: vpshufb {{.*}}, %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-;CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0
-;CHECK-WIDE-NEXT: ret
 define <8 x float> @foo1_8(<8 x i8> %src) {
+; CHECK-LABEL: foo1_8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-NEXT:    vpmovzxwd %xmm0, %xmm0
+; CHECK-NEXT:    vpslld $24, %xmm0, %xmm0
+; CHECK-NEXT:    vpsrad $24, %xmm0, %xmm0
+; CHECK-NEXT:    vpslld $24, %xmm1, %xmm1
+; CHECK-NEXT:    vpsrad $24, %xmm1, %xmm1
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; CHECK-NEXT:    retl
+;
+; CHECK-WIDE-LABEL: foo1_8:
+; CHECK-WIDE:       ## BB#0:
+; CHECK-WIDE-NEXT:    vpmovzxbd %xmm0, %xmm1
+; CHECK-WIDE-NEXT:    vpslld $24, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpsrad $24, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-WIDE-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-WIDE-NEXT:    vpslld $24, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpsrad $24, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-WIDE-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; CHECK-WIDE-NEXT:    retl
   %res = sitofp <8 x i8> %src to <8 x float>
   ret <8 x float> %res
 }
 
-;CHECK-LABEL: foo1_4:
-;CHECK: vcvtdq2ps
-;CHECK: ret
-;
-;CHECK-WIDE-LABEL: foo1_4:
-;CHECK-WIDE:      vpmovzxbd %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
-;CHECK-WIDE-NEXT: ret
 define <4 x float> @foo1_4(<4 x i8> %src) {
+; CHECK-LABEL: foo1_4:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpslld $24, %xmm0, %xmm0
+; CHECK-NEXT:    vpsrad $24, %xmm0, %xmm0
+; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-WIDE-LABEL: foo1_4:
+; CHECK-WIDE:       ## BB#0:
+; CHECK-WIDE-NEXT:    vpmovzxbd %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpslld $24, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpsrad $24, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    retl
   %res = sitofp <4 x i8> %src to <4 x float>
   ret <4 x float> %res
 }
 
-;CHECK-LABEL: foo2_8:
-;CHECK: vcvtdq2ps
-;CHECK: ret
-;
-;CHECK-WIDE-LABEL: foo2_8:
-;CHECK-WIDE: vcvtdq2ps %ymm{{.*}}, %ymm{{.*}}
-;CHECK-WIDE: ret
 define <8 x float> @foo2_8(<8 x i8> %src) {
+; CHECK-LABEL: foo2_8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmovzxwd %xmm0, %xmm1
+; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vandps LCPI2_0, %ymm0, %ymm0
+; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; CHECK-NEXT:    retl
+;
+; CHECK-WIDE-LABEL: foo2_8:
+; CHECK-WIDE:       ## BB#0:
+; CHECK-WIDE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-WIDE-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; CHECK-WIDE-NEXT:    vpshufb %xmm3, %xmm2, %xmm4
+; CHECK-WIDE-NEXT:    vmovdqa {{.*#+}} xmm5 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; CHECK-WIDE-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; CHECK-WIDE-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; CHECK-WIDE-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; CHECK-WIDE-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
+; CHECK-WIDE-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-WIDE-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; CHECK-WIDE-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-WIDE-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; CHECK-WIDE-NEXT:    retl
   %res = uitofp <8 x i8> %src to <8 x float>
   ret <8 x float> %res
 }
 
-;CHECK-LABEL: foo2_4:
-;CHECK: vcvtdq2ps
-;CHECK: ret
-;
-;CHECK-WIDE-LABEL: foo2_4:
-;CHECK-WIDE: vcvtdq2ps %xmm{{.*}}, %xmm{{.*}}
-;CHECK-WIDE: ret
 define <4 x float> @foo2_4(<4 x i8> %src) {
+; CHECK-LABEL: foo2_4:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vandps LCPI3_0, %xmm0, %xmm0
+; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-WIDE-LABEL: foo2_4:
+; CHECK-WIDE:       ## BB#0:
+; CHECK-WIDE-NEXT:    vpmovzxbd %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    retl
   %res = uitofp <4 x i8> %src to <4 x float>
   ret <4 x float> %res
 }
 
-;CHECK-LABEL: foo3_8:
-;CHECK: vcvttps2dq
-;CHECK: ret
 define <8 x i8> @foo3_8(<8 x float> %src) {
+; CHECK-LABEL: foo3_8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl
+;
+; CHECK-WIDE-LABEL: foo3_8:
+; CHECK-WIDE:       ## BB#0:
+; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
+; CHECK-WIDE-NEXT:    shll $8, %eax
+; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %ecx
+; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
+; CHECK-WIDE-NEXT:    orl %eax, %ecx
+; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
+; CHECK-WIDE-NEXT:    shll $8, %eax
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %edx
+; CHECK-WIDE-NEXT:    movzbl %dl, %edx
+; CHECK-WIDE-NEXT:    orl %eax, %edx
+; CHECK-WIDE-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
+; CHECK-WIDE-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    shll $8, %eax
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
+; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
+; CHECK-WIDE-NEXT:    orl %eax, %ecx
+; CHECK-WIDE-NEXT:    vpinsrw $2, %ecx, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    shll $8, %eax
+; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
+; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
+; CHECK-WIDE-NEXT:    orl %eax, %ecx
+; CHECK-WIDE-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm0
+; CHECK-WIDE-NEXT:    vzeroupper
+; CHECK-WIDE-NEXT:    retl
   %res = fptosi <8 x float> %src to <8 x i8>
   ret <8 x i8> %res
 }
-;CHECK-LABEL: foo3_4:
-;CHECK: vcvttps2dq
-;CHECK: ret
+
 define <4 x i8> @foo3_4(<4 x float> %src) {
+; CHECK-LABEL: foo3_4:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-WIDE-LABEL: foo3_4:
+; CHECK-WIDE:       ## BB#0:
+; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
+; CHECK-WIDE-NEXT:    shll $8, %eax
+; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %ecx
+; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
+; CHECK-WIDE-NEXT:    orl %eax, %ecx
+; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
+; CHECK-WIDE-NEXT:    shll $8, %eax
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %edx
+; CHECK-WIDE-NEXT:    movzbl %dl, %edx
+; CHECK-WIDE-NEXT:    orl %eax, %edx
+; CHECK-WIDE-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    retl
   %res = fptosi <4 x float> %src to <4 x i8>
   ret <4 x i8> %res
 }
diff --git a/test/CodeGen/X86/vec_compare-2.ll b/test/CodeGen/X86/vec_compare-2.ll
deleted file mode 100644
index 4da7953..0000000
--- a/test/CodeGen/X86/vec_compare-2.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc < %s -mtriple=i686-linux -mcpu=penryn | FileCheck %s
-
-declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
-
-declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
-
-declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
-
-define void @blackDespeckle_wrapper(i8** %args_list, i64* %gtid, i64 %xend) {
-entry:
-; CHECK: cfi_def_cfa_offset
-; CHECK-NOT: set
-; CHECK: pmovzxwq
-; CHECK: pshufb
-  %shr.i = ashr <4 x i32> zeroinitializer, <i32 3, i32 3, i32 3, i32 3> ; <<4 x i32>> [#uses=1]
-  %cmp318.i = sext <4 x i1> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1]
-  %sub322.i = sub <4 x i32> %shr.i, zeroinitializer ; <<4 x i32>> [#uses=1]
-  %cmp323.x = icmp slt <4 x i32> zeroinitializer, %sub322.i ; <<4 x i1>> [#uses=1]
-  %cmp323.i = sext <4 x i1> %cmp323.x to <4 x i32> ; <<4 x i32>> [#uses=1]
-  %or.i = or <4 x i32> %cmp318.i, %cmp323.i       ; <<4 x i32>> [#uses=1]
-  %tmp10.i83.i = bitcast <4 x i32> %or.i to <4 x float> ; <<4 x float>> [#uses=1]
-  %0 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> undef, <4 x float> undef, <4 x float> %tmp10.i83.i) nounwind ; <<4 x float>> [#uses=1]
-  %conv.i.i15.i = bitcast <4 x float> %0 to <4 x i32> ; <<4 x i32>> [#uses=1]
-  %swz.i.i28.i = shufflevector <4 x i32> %conv.i.i15.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1> ; <<2 x i32>> [#uses=1]
-  %tmp6.i29.i = bitcast <2 x i32> %swz.i.i28.i to <4 x i16> ; <<4 x i16>> [#uses=1]
-  %swz.i30.i = shufflevector <4 x i16> %tmp6.i29.i, <4 x i16> undef, <2 x i32> <i32 0, i32 1> ; <<2 x i16>> [#uses=1]
-  store <2 x i16> %swz.i30.i, <2 x i16>* undef
-  unreachable
-  ret void
-}
diff --git a/test/CodeGen/X86/vec_ctbits.ll b/test/CodeGen/X86/vec_ctbits.ll
index 0aa72b1..318aca1 100644
--- a/test/CodeGen/X86/vec_ctbits.ll
+++ b/test/CodeGen/X86/vec_ctbits.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s
 
 declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
 declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1)
diff --git a/test/CodeGen/X86/vec_extract-sse4.ll b/test/CodeGen/X86/vec_extract-sse4.ll
index 3cb519a..530911a 100644
--- a/test/CodeGen/X86/vec_extract-sse4.ll
+++ b/test/CodeGen/X86/vec_extract-sse4.ll
@@ -1,10 +1,14 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse4.1 -o %t
-; RUN: not grep extractps   %t
-; RUN: not grep pextrd      %t
-; RUN: not grep pshufd  %t
-; RUN: not grep movss   %t
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse4.1 | FileCheck %s
 
 define void @t1(float* %R, <4 x float>* %P1) nounwind {
+; CHECK-LABEL: t1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movss 12(%ecx), %xmm0
+; CHECK-NEXT:    movss %xmm0, (%eax)
+; CHECK-NEXT:    retl
+
 	%X = load <4 x float>* %P1
 	%tmp = extractelement <4 x float> %X, i32 3
 	store float %tmp, float* %R
@@ -12,12 +16,31 @@ define void @t1(float* %R, <4 x float>* %P1) nounwind {
 }
 
 define float @t2(<4 x float>* %P1) nounwind {
+; CHECK-LABEL: t2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movapd (%eax), %xmm0
+; CHECK-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    movss %xmm0, (%esp)
+; CHECK-NEXT:    flds (%esp)
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    retl
+
 	%X = load <4 x float>* %P1
 	%tmp = extractelement <4 x float> %X, i32 2
 	ret float %tmp
 }
 
 define void @t3(i32* %R, <4 x i32>* %P1) nounwind {
+; CHECK-LABEL: t3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl 12(%ecx), %ecx
+; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    retl
+
 	%X = load <4 x i32>* %P1
 	%tmp = extractelement <4 x i32> %X, i32 3
 	store i32 %tmp, i32* %R
@@ -25,6 +48,12 @@ define void @t3(i32* %R, <4 x i32>* %P1) nounwind {
 }
 
 define i32 @t4(<4 x i32>* %P1) nounwind {
+; CHECK-LABEL: t4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl 12(%eax), %eax
+; CHECK-NEXT:    retl
+
 	%X = load <4 x i32>* %P1
 	%tmp = extractelement <4 x i32> %X, i32 3
 	ret i32 %tmp
diff --git a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll
index 88f5a58..6df7be7 100644
--- a/test/CodeGen/X86/vec_extract.ll
+++ b/test/CodeGen/X86/vec_extract.ll
@@ -1,10 +1,17 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse4.1 -o %t
-; RUN: grep movss    %t | count 4
-; RUN: grep movhlps  %t | count 1
-; RUN: not grep pshufd   %t 
-; RUN: grep unpckhpd %t | count 1
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse4.1 | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
 
 define void @test1(<4 x float>* %F, float* %f) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movaps (%ecx), %xmm0
+; CHECK-NEXT:    addps %xmm0, %xmm0
+; CHECK-NEXT:    movss %xmm0, (%eax)
+; CHECK-NEXT:    retl
+entry:
 	%tmp = load <4 x float>* %F		; <<4 x float>> [#uses=2]
 	%tmp7 = fadd <4 x float> %tmp, %tmp		; <<4 x float>> [#uses=1]
 	%tmp2 = extractelement <4 x float> %tmp7, i32 0		; <float> [#uses=1]
@@ -13,6 +20,18 @@ define void @test1(<4 x float>* %F, float* %f) nounwind {
 }
 
 define float @test2(<4 x float>* %F, float* %f) nounwind {
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movaps (%eax), %xmm0
+; CHECK-NEXT:    addps %xmm0, %xmm0
+; CHECK-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    movss %xmm0, (%esp)
+; CHECK-NEXT:    flds (%esp)
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    retl
+entry:
 	%tmp = load <4 x float>* %F		; <<4 x float>> [#uses=2]
 	%tmp7 = fadd <4 x float> %tmp, %tmp		; <<4 x float>> [#uses=1]
 	%tmp2 = extractelement <4 x float> %tmp7, i32 2		; <float> [#uses=1]
@@ -20,6 +39,14 @@ define float @test2(<4 x float>* %F, float* %f) nounwind {
 }
 
 define void @test3(float* %R, <4 x float>* %P1) nounwind {
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movss 12(%ecx), %xmm0
+; CHECK-NEXT:    movss %xmm0, (%eax)
+; CHECK-NEXT:    retl
+entry:
 	%X = load <4 x float>* %P1		; <<4 x float>> [#uses=1]
 	%tmp = extractelement <4 x float> %X, i32 3		; <float> [#uses=1]
 	store float %tmp, float* %R
@@ -27,6 +54,17 @@ define void @test3(float* %R, <4 x float>* %P1) nounwind {
 }
 
 define double @test4(double %A) nounwind {
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    calll foo
+; CHECK-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    addsd {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT:    movsd %xmm0, (%esp)
+; CHECK-NEXT:    fldl (%esp)
+; CHECK-NEXT:    addl $12, %esp
+; CHECK-NEXT:    retl
+entry:
 	%tmp1 = call <2 x double> @foo( )		; <<2 x double>> [#uses=1]
 	%tmp2 = extractelement <2 x double> %tmp1, i32 1		; <double> [#uses=1]
 	%tmp3 = fadd double %tmp2, %A		; <double> [#uses=1]
diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll
index 82517cb..ac02acf 100644
--- a/test/CodeGen/X86/vec_fabs.ll
+++ b/test/CodeGen/X86/vec_fabs.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s
 
 
 define <2 x double> @fabs_v2f64(<2 x double> %p)
 {
-  ; CHECK: fabs_v2f64
+  ; CHECK-LABEL: fabs_v2f64
   ; CHECK: vandps
   %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
   ret <2 x double> %t
@@ -12,7 +12,7 @@ declare <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
 
 define <4 x float> @fabs_v4f32(<4 x float> %p)
 {
-  ; CHECK: fabs_v4f32
+  ; CHECK-LABEL: fabs_v4f32
   ; CHECK: vandps
   %t = call <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
   ret <4 x float> %t
@@ -21,7 +21,7 @@ declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
 
 define <4 x double> @fabs_v4f64(<4 x double> %p)
 {
-  ; CHECK: fabs_v4f64
+  ; CHECK-LABEL: fabs_v4f64
   ; CHECK: vandps
   %t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
   ret <4 x double> %t
@@ -30,9 +30,46 @@ declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
 
 define <8 x float> @fabs_v8f32(<8 x float> %p)
 {
-  ; CHECK: fabs_v8f32
+  ; CHECK-LABEL: fabs_v8f32
   ; CHECK: vandps
   %t = call <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
   ret <8 x float> %t
 }
 declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
+
+; PR20354: when generating code for a vector fabs op,
+; make sure that we're only turning off the sign bit of each float value.
+; No constant pool loads or vector ops are needed for the fabs of a
+; bitcasted integer constant; we should just return an integer constant
+; that has the sign bits turned off.
+;
+; So instead of something like this:
+;    movabsq (constant pool load of mask for sign bits) 
+;    vmovq   (move from integer register to vector/fp register)
+;    vandps  (mask off sign bits)
+;    vmovq   (move vector/fp register back to integer return register)
+;
+; We should generate:
+;    mov     (put constant value in return register)
+
+define i64 @fabs_v2f32_1() {
+; CHECK-LABEL: fabs_v2f32_1:
+; CHECK: movabsq $9223372032559808512, %rax # imm = 0x7FFFFFFF00000000
+; CHECK-NEXT: retq
+ %bitcast = bitcast i64 18446744069414584320 to <2 x float> ; 0xFFFF_FFFF_0000_0000
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
+ %ret = bitcast <2 x float> %fabs to i64
+ ret i64 %ret
+}
+
+define i64 @fabs_v2f32_2() {
+; CHECK-LABEL: fabs_v2f32_2:
+; CHECK: movl $2147483647, %eax       # imm = 0x7FFFFFFF
+; CHECK-NEXT: retq
+ %bitcast = bitcast i64 4294967295 to <2 x float> ; 0x0000_0000_FFFF_FFFF
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
+ %ret = bitcast <2 x float> %fabs to i64
+ ret i64 %ret
+}
+
+declare <2 x float> @llvm.fabs.v2f32(<2 x float> %p)
diff --git a/test/CodeGen/X86/vec_fneg.ll b/test/CodeGen/X86/vec_fneg.ll
index d49c70e..9743f71 100644
--- a/test/CodeGen/X86/vec_fneg.ll
+++ b/test/CodeGen/X86/vec_fneg.ll
@@ -1,11 +1,45 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s
 
+; FNEG is defined as subtraction from -0.0.
+
+; This test verifies that we use an xor with a constant to flip the sign bits; no subtraction needed.
 define <4 x float> @t1(<4 x float> %Q) {
-        %tmp15 = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %Q
-	ret <4 x float> %tmp15
+; CHECK-LABEL: t1:
+; CHECK: xorps	{{.*}}LCPI0_0{{.*}}, %xmm0
+; CHECK-NEXT: retq
+        %tmp = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %Q
+	ret <4 x float> %tmp
 }
 
+; This test verifies that we generate an FP subtraction because "0.0 - x" is not an fneg.
 define <4 x float> @t2(<4 x float> %Q) {
-        %tmp15 = fsub <4 x float> zeroinitializer, %Q
-	ret <4 x float> %tmp15
+; CHECK-LABEL: t2:
+; CHECK: xorps	%[[X:xmm[0-9]+]], %[[X]]
+; CHECK-NEXT: subps	%xmm0, %[[X]]
+; CHECK-NEXT: movaps	%[[X]], %xmm0
+; CHECK-NEXT: retq
+        %tmp = fsub <4 x float> zeroinitializer, %Q
+	ret <4 x float> %tmp
+}
+
+; If we're bitcasting an integer to an FP vector, we should avoid the FPU/vector unit entirely.
+; Make sure that we're flipping the sign bit and only the sign bit of each float.
+; So instead of something like this:
+;    movd	%rdi, %xmm0
+;    xorps	.LCPI2_0(%rip), %xmm0
+;
+; We should generate:
+;    movabsq     (put sign bit mask in integer register))
+;    xorq        (flip sign bits)
+;    movd        (move to xmm return register) 
+
+define <2 x float> @fneg_bitcast(i64 %i) {
+; CHECK-LABEL: fneg_bitcast:
+; CHECK:	movabsq	$-9223372034707292160, %rax # imm = 0x8000000080000000
+; CHECK-NEXT:	xorq	%rdi, %rax
+; CHECK-NEXT:	movd	%rax, %xmm0
+; CHECK-NEXT:	retq
+  %bitcast = bitcast i64 %i to <2 x float>
+  %fneg = fsub <2 x float> <float -0.0, float -0.0>, %bitcast
+  ret <2 x float> %fneg
 }
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index 7ec07ae..b882a5e 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -3,6 +3,8 @@
 
 ; PR11674
 define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) {
+; CHECK-LABEL: fpext_frommem:
+; AVX-LABEL: fpext_frommem:
 entry:
 ; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
 ; AVX: vcvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
@@ -13,6 +15,8 @@ entry:
 }
 
 define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
+; CHECK-LABEL: fpext_frommem4:
+; AVX-LABEL: fpext_frommem4:
 entry:
 ; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
 ; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
@@ -24,6 +28,8 @@ entry:
 }
 
 define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
+; CHECK-LABEL: fpext_frommem8:
+; AVX-LABEL: fpext_frommem8:
 entry:
 ; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
 ; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll
index 5cb9f69..b72044a 100644
--- a/test/CodeGen/X86/vec_insert-5.ll
+++ b/test/CodeGen/X86/vec_insert-5.ll
@@ -2,66 +2,87 @@
 ; There are no MMX operations in @t1
 
 define void  @t1(i32 %a, x86_mmx* %P) nounwind {
-       %tmp12 = shl i32 %a, 12
-       %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
-       %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
-       %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx
-       store x86_mmx %tmp23, x86_mmx* %P
-       ret void
-
 ; CHECK-LABEL: t1:
-; CHECK-NOT: %mm
-; CHECK: shll $12
-; CHECK-NOT: %mm
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    shll $12, %ecx
+; CHECK-NEXT:    movd %ecx, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1]
+; CHECK-NEXT:    movlpd %xmm0, (%eax)
+; CHECK-NEXT:    retl
+ %tmp12 = shl i32 %a, 12
+ %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
+ %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
+ %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx
+ store x86_mmx %tmp23, x86_mmx* %P
+ ret void
 }
 
 define <4 x float> @t2(<4 x float>* %P) nounwind {
-        %tmp1 = load <4 x float>* %P
-        %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
-        ret <4 x float> %tmp2
-
 ; CHECK-LABEL: t2:
-; CHECK: pslldq $12
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movaps (%eax), %xmm1
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; CHECK-NEXT:    retl
+  %tmp1 = load <4 x float>* %P
+  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
+  ret <4 x float> %tmp2
 }
 
 define <4 x float> @t3(<4 x float>* %P) nounwind {
-        %tmp1 = load <4 x float>* %P
-        %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 >
-        ret <4 x float> %tmp2
-
 ; CHECK-LABEL: t3:
-; CHECK: psrldq $8
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movaps (%eax), %xmm0
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,0]
+; CHECK-NEXT:    retl
+  %tmp1 = load <4 x float>* %P
+  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 >
+  ret <4 x float> %tmp2
 }
 
 define <4 x float> @t4(<4 x float>* %P) nounwind {
-        %tmp1 = load <4 x float>* %P
-        %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
-        ret <4 x float> %tmp2
-
 ; CHECK-LABEL: t4:
-; CHECK: psrldq $12
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movaps (%eax), %xmm0
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
+; CHECK-NEXT:    retl
+  %tmp1 = load <4 x float>* %P
+  %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
+  ret <4 x float> %tmp2
 }
 
 define <16 x i8> @t5(<16 x i8> %x) nounwind {
-        %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
-        ret <16 x i8> %s
-
 ; CHECK-LABEL: t5:
-; CHECK: psrldq $1
+; CHECK:       # BB#0:
+; CHECK-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT:    retl
+  %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
+  ret <16 x i8> %s
 }
 
 define <16 x i8> @t6(<16 x i8> %x) nounwind {
-        %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-        ret <16 x i8> %s
-
 ; CHECK-LABEL: t6:
-; CHECK: palignr $1
+; CHECK:       # BB#0:
+; CHECK-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT:    retl
+  %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %s
 }
 
 define <16 x i8> @t7(<16 x i8> %x) nounwind {
-        %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2>
-        ret <16 x i8> %s
-
 ; CHECK-LABEL: t7:
-; CHECK: pslldq $13
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; CHECK-NEXT:    retl
+  %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2>
+  ret <16 x i8> %s
 }
diff --git a/test/CodeGen/X86/vec_insert-6.ll b/test/CodeGen/X86/vec_insert-6.ll
deleted file mode 100644
index 4583e19..0000000
--- a/test/CodeGen/X86/vec_insert-6.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | grep pslldq
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -mtriple=i686-apple-darwin9 -o /dev/null -stats -info-output-file - | grep asm-printer | grep 6
-
-define <4 x float> @t3(<4 x float>* %P) nounwind  {
-	%tmp1 = load <4 x float>* %P
-	%tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
-	ret <4 x float> %tmp2
-}
diff --git a/test/CodeGen/X86/vec_insert.ll b/test/CodeGen/X86/vec_insert.ll
deleted file mode 100644
index 0ed8f10..0000000
--- a/test/CodeGen/X86/vec_insert.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | grep movss | count 1
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | not grep pinsrw
-
-define void @test(<4 x float>* %F, i32 %I) nounwind {
-	%tmp = load <4 x float>* %F		; <<4 x float>> [#uses=1]
-	%f = sitofp i32 %I to float		; <float> [#uses=1]
-	%tmp1 = insertelement <4 x float> %tmp, float %f, i32 0		; <<4 x float>> [#uses=2]
-	%tmp18 = fadd <4 x float> %tmp1, %tmp1		; <<4 x float>> [#uses=1]
-	store <4 x float> %tmp18, <4 x float>* %F
-	ret void
-}
-
-define void @test2(<4 x float>* %F, i32 %I, float %g) nounwind {
-	%tmp = load <4 x float>* %F		; <<4 x float>> [#uses=1]
-	%f = sitofp i32 %I to float		; <float> [#uses=1]
-	%tmp1 = insertelement <4 x float> %tmp, float %f, i32 2		; <<4 x float>> [#uses=1]
-	store <4 x float> %tmp1, <4 x float>* %F
-	ret void
-}
diff --git a/test/CodeGen/X86/vec_return.ll b/test/CodeGen/X86/vec_return.ll
index 2cf5dc6..f7fcd03 100644
--- a/test/CodeGen/X86/vec_return.ll
+++ b/test/CodeGen/X86/vec_return.ll
@@ -10,7 +10,7 @@ define <2 x double> @test() {
 ; Prefer a constant pool load here.
 ; CHECK: test2
 ; CHECK-NOT: shuf
-; CHECK: movaps {{.*}}CPI
+; CHECK: movaps {{.*}}{{CPI|__xmm@}}
 define <4 x i32> @test2() nounwind  {
 	ret <4 x i32> < i32 0, i32 0, i32 1, i32 0 >
 }
diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll
index d1d7608..a13c813 100644
--- a/test/CodeGen/X86/vec_set-3.ll
+++ b/test/CodeGen/X86/vec_set-3.ll
@@ -1,17 +1,37 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -o %t
-; RUN: grep pshufd %t | count 2
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | FileCheck %s
 
-define <4 x float> @test(float %a) nounwind {
-        %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1               ; <<4 x float>> [#uses=1]
-        %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2               ; <<4 x float>> [#uses=1]
-        %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3              ; <<4 x float>> [#uses=1]
-        ret <4 x float> %tmp6
+define <4 x float> @test(float %a) {
+; CHECK-LABEL: test:
+; CHECK:         insertps $29, {{.*}}, %xmm0
+; CHECK-NEXT:    retl
+
+entry:
+  %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1
+  %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2
+  %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3
+  ret <4 x float> %tmp6
 }
 
-define <2 x i64> @test2(i32 %a) nounwind {
-        %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2          ; <<4 x i32>> [#uses=1]
-        %tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3             ; <<4 x i32>> [#uses=1]
-        %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64>           ; <<2 x i64>> [#uses=1]
-        ret <2 x i64> %tmp10
+define <2 x i64> @test2(i32 %a) {
+; CHECK-LABEL: test2:
+; CHECK:         movd {{.*}}, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; CHECK-NEXT:    retl
+
+entry:
+  %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2
+  %tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3
+  %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64>
+  ret <2 x i64> %tmp10
 }
 
+define <4 x float> @test3(<4 x float> %A) {
+; CHECK-LABEL: test3:
+; CHECK:         insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
+; CHECK-NEXT:    retl
+
+  %tmp0 = extractelement <4 x float> %A, i32 0
+  %tmp1 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef >, float %tmp0, i32 1
+  %tmp2 = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 2
+  ret <4 x float> %tmp2
+}
diff --git a/test/CodeGen/X86/vec_set-5.ll b/test/CodeGen/X86/vec_set-5.ll
deleted file mode 100644
index f811a74..0000000
--- a/test/CodeGen/X86/vec_set-5.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t
-; RUN: grep movlhps   %t | count 1
-; RUN: grep movq      %t | count 2
-
-define <4 x float> @test1(float %a, float %b) nounwind {
-	%tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0		; <<4 x float>> [#uses=1]
-	%tmp6 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1		; <<4 x float>> [#uses=1]
-	%tmp8 = insertelement <4 x float> %tmp6, float %b, i32 2		; <<4 x float>> [#uses=1]
-	%tmp9 = insertelement <4 x float> %tmp8, float 0.000000e+00, i32 3		; <<4 x float>> [#uses=1]
-	ret <4 x float> %tmp9
-}
-
-define <4 x float> @test2(float %a, float %b) nounwind {
-	%tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0		; <<4 x float>> [#uses=1]
-	%tmp7 = insertelement <4 x float> %tmp, float %b, i32 1		; <<4 x float>> [#uses=1]
-	%tmp8 = insertelement <4 x float> %tmp7, float 0.000000e+00, i32 2		; <<4 x float>> [#uses=1]
-	%tmp9 = insertelement <4 x float> %tmp8, float 0.000000e+00, i32 3		; <<4 x float>> [#uses=1]
-	ret <4 x float> %tmp9
-}
-
-define <2 x i64> @test3(i32 %a, i32 %b) nounwind {
-	%tmp = insertelement <4 x i32> zeroinitializer, i32 %a, i32 0		; <<4 x i32>> [#uses=1]
-	%tmp6 = insertelement <4 x i32> %tmp, i32 %b, i32 1		; <<4 x i32>> [#uses=1]
-	%tmp8 = insertelement <4 x i32> %tmp6, i32 0, i32 2		; <<4 x i32>> [#uses=1]
-	%tmp10 = insertelement <4 x i32> %tmp8, i32 0, i32 3		; <<4 x i32>> [#uses=1]
-	%tmp11 = bitcast <4 x i32> %tmp10 to <2 x i64>		; <<2 x i64>> [#uses=1]
-	ret <2 x i64> %tmp11
-}
diff --git a/test/CodeGen/X86/vec_set-9.ll b/test/CodeGen/X86/vec_set-9.ll
deleted file mode 100644
index a739090..0000000
--- a/test/CodeGen/X86/vec_set-9.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mattr=-avx,-pad-short-functions | FileCheck %s
-
-; CHECK: test3
-; CHECK: movd
-; CHECK-NOT: movd
-; CHECK: {{movlhps.*%xmm0, %xmm0}}
-; CHECK-NEXT: ret
-
-define <2 x i64> @test3(i64 %A) nounwind {
-entry:
-	%B = insertelement <2 x i64> undef, i64 %A, i32 1
-	ret <2 x i64> %B
-}
-
diff --git a/test/CodeGen/X86/vec_set-E.ll b/test/CodeGen/X86/vec_set-E.ll
deleted file mode 100644
index d78be66..0000000
--- a/test/CodeGen/X86/vec_set-E.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movq
-
-define <4 x float> @t(float %X) nounwind  {
-	%tmp11 = insertelement <4 x float> undef, float %X, i32 0
-	%tmp12 = insertelement <4 x float> %tmp11, float %X, i32 1
-	%tmp27 = insertelement <4 x float> %tmp12, float 0.000000e+00, i32 2
-	%tmp28 = insertelement <4 x float> %tmp27, float 0.000000e+00, i32 3
-	ret <4 x float> %tmp28
-}
diff --git a/test/CodeGen/X86/vec_set-G.ll b/test/CodeGen/X86/vec_set-G.ll
deleted file mode 100644
index 4a542fe..0000000
--- a/test/CodeGen/X86/vec_set-G.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movss
-
-define fastcc void @t(<4 x float> %A) nounwind  {
-	%tmp41896 = extractelement <4 x float> %A, i32 0		; <float> [#uses=1]
-	%tmp14082 = insertelement <4 x float> < float 0.000000e+00, float undef, float undef, float undef >, float %tmp41896, i32 1		; <<4 x float>> [#uses=1]
-	%tmp14083 = insertelement <4 x float> %tmp14082, float 0.000000e+00, i32 2		; <<4 x float>> [#uses=1]
-	store <4 x float> %tmp14083, <4 x float>* null, align 16
-        ret void
-}
diff --git a/test/CodeGen/X86/vec_set-I.ll b/test/CodeGen/X86/vec_set-I.ll
deleted file mode 100644
index c5d6ab8..0000000
--- a/test/CodeGen/X86/vec_set-I.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
-
-; CHECK-NOT: xorp
-; CHECK: movd
-; CHECK-NOT: xorp
-
-define void @t1() nounwind  {
-	%tmp298.i.i = load <4 x float>* null, align 16
-	%tmp304.i.i = bitcast <4 x float> %tmp298.i.i to <4 x i32>
-	%tmp305.i.i = and <4 x i32> %tmp304.i.i, < i32 -1, i32 0, i32 0, i32 0 >
-	store <4 x i32> %tmp305.i.i, <4 x i32>* null, align 16
-	unreachable
-}
diff --git a/test/CodeGen/X86/vec_set-J.ll b/test/CodeGen/X86/vec_set-J.ll
deleted file mode 100644
index d90ab85..0000000
--- a/test/CodeGen/X86/vec_set-J.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movss
-; PR2472
-
-define <4 x i32> @a(<4 x i32> %a) nounwind {
-entry:
-        %vecext = extractelement <4 x i32> %a, i32 0
-        insertelement <4 x i32> zeroinitializer, i32 %vecext, i32 0
-        %add = add <4 x i32> %a, %0
-        ret <4 x i32> %add
-}
diff --git a/test/CodeGen/X86/vec_setcc.ll b/test/CodeGen/X86/vec_setcc.ll
index 322dbae..b69f90c 100644
--- a/test/CodeGen/X86/vec_setcc.ll
+++ b/test/CodeGen/X86/vec_setcc.ll
@@ -62,8 +62,7 @@ define <8 x i16> @v8i16_icmp_ule(<8 x i16> %a, <8 x i16> %b) nounwind readnone s
 ; SSE2-LABEL: v8i16_icmp_ule:
 ; SSE2: psubusw %xmm1, %xmm0
 ; SSE2: pxor    %xmm1, %xmm1
-; SSE2: pcmpeqw %xmm0, %xmm1
-; SSE2: movdqa  %xmm1, %xmm0
+; SSE2: pcmpeqw %xmm1, %xmm0
 
 ; SSE41-LABEL: v8i16_icmp_ule:
 ; SSE41: pminuw  %xmm0, %xmm1
@@ -106,8 +105,7 @@ define <4 x i32> @v4i32_icmp_ule(<4 x i32> %a, <4 x i32> %b) nounwind readnone s
 ; SSE2: pxor    %xmm2, %xmm0
 ; SSE2: pcmpgtd %xmm1, %xmm0
 ; SSE2: pcmpeqd %xmm1, %xmm1
-; SSE2: pxor    %xmm0, %xmm1
-; SSE2: movdqa  %xmm1, %xmm0
+; SSE2: pxor    %xmm1, %xmm0
 
 ; SSE41-LABEL: v4i32_icmp_ule:
 ; SSE41: pminud  %xmm0, %xmm1
diff --git a/test/CodeGen/X86/vec_sext.ll b/test/CodeGen/X86/vec_sext.ll
deleted file mode 100644
index 776ddec..0000000
--- a/test/CodeGen/X86/vec_sext.ll
+++ /dev/null
@@ -1,69 +0,0 @@
-; RUN: llc < %s -march=x86-64
-; PR 9267
-
-define<4 x i32> @func_16_32() {
-  %F = load <4 x i16>* undef
-  %G = sext <4 x i16> %F to <4 x i32>
-  %H = load <4 x i16>* undef
-  %Y = sext <4 x i16> %H to <4 x i32>
-  %T = add <4 x i32> %Y, %G
-  store <4 x i32>%T , <4 x i32>* undef
-  ret <4 x i32> %T
-}
-
-define<4 x i64> @func_16_64() {
-  %F = load <4 x i16>* undef
-  %G = sext <4 x i16> %F to <4 x i64>
-  %H = load <4 x i16>* undef
-  %Y = sext <4 x i16> %H to <4 x i64>
-  %T = xor <4 x i64> %Y, %G
-  store <4 x i64>%T , <4 x i64>* undef
-  ret <4 x i64> %T
-}
-
-define<4 x i64> @func_32_64() {
-  %F = load <4 x i32>* undef
-  %G = sext <4 x i32> %F to <4 x i64>
-  %H = load <4 x i32>* undef
-  %Y = sext <4 x i32> %H to <4 x i64>
-  %T = or <4 x i64> %Y, %G
-  ret <4 x i64> %T
-}
-
-define<4 x i16> @func_8_16() {
-  %F = load <4 x i8>* undef
-  %G = sext <4 x i8> %F to <4 x i16>
-  %H = load <4 x i8>* undef
-  %Y = sext <4 x i8> %H to <4 x i16>
-  %T = add <4 x i16> %Y, %G
-  ret <4 x i16> %T
-}
-
-define<4 x i32> @func_8_32() {
-  %F = load <4 x i8>* undef
-  %G = sext <4 x i8> %F to <4 x i32>
-  %H = load <4 x i8>* undef
-  %Y = sext <4 x i8> %H to <4 x i32>
-  %T = sub <4 x i32> %Y, %G
-  ret <4 x i32> %T
-}
-
-define<4 x i64> @func_8_64() {
-  %F = load <4 x i8>* undef
-  %G = sext <4 x i8> %F to <4 x i64>
-  %H = load <4 x i8>* undef
-  %Y = sext <4 x i8> %H to <4 x i64>
-  %T = add <4 x i64> %Y, %G
-  ret <4 x i64> %T
-}
-
-define<4 x i32> @const_16_32() {
-  %G = sext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i32>
-  ret <4 x i32> %G
-}
-
-define<4 x i64> @const_16_64() {
-  %G = sext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i64>
-  ret <4 x i64> %G
-}
-
diff --git a/test/CodeGen/X86/vec_shuffle-11.ll b/test/CodeGen/X86/vec_shuffle-11.ll
deleted file mode 100644
index 640745a..0000000
--- a/test/CodeGen/X86/vec_shuffle-11.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | not grep mov
-
-define <4 x i32> @test() nounwind {
-        %tmp131 = call <2 x i64> @llvm.x86.sse2.psrl.dq( <2 x i64> < i64 -1, i64 -1 >, i32 96 )         ; <<2 x i64>> [#uses=1]
-        %tmp137 = bitcast <2 x i64> %tmp131 to <4 x i32>                ; <<4 x i32>> [#uses=1]
-        %tmp138 = and <4 x i32> %tmp137, bitcast (<2 x i64> < i64 -1, i64 -1 > to <4 x i32>)            ; <<4 x i32>> [#uses=1]
-        ret <4 x i32> %tmp138
-}
-
-declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32)
diff --git a/test/CodeGen/X86/vec_shuffle-14.ll b/test/CodeGen/X86/vec_shuffle-14.ll
deleted file mode 100644
index 8f25197..0000000
--- a/test/CodeGen/X86/vec_shuffle-14.ll
+++ /dev/null
@@ -1,70 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s -check-prefix=X86-32
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,-avx | FileCheck %s -check-prefix=X86-64
-
-define <4 x i32> @t1(i32 %a) nounwind  {
-entry:
-        %tmp = insertelement <4 x i32> undef, i32 %a, i32 0
-	%tmp6 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp, <4 x i32> < i32 4, i32 1, i32 2, i32 3 >		; <<4 x i32>> [#uses=1]
-	ret <4 x i32> %tmp6
-
-; X86-32-LABEL: t1:
-; X86-32: movd	4(%esp), %xmm0
-
-; X86-64-LABEL: t1:
-; X86-64: movd	%e{{..}}, %xmm0
-}
-
-define <2 x i64> @t2(i64 %a) nounwind  {
-entry:
-        %tmp = insertelement <2 x i64> undef, i64 %a, i32 0
-	%tmp6 = shufflevector <2 x i64> zeroinitializer, <2 x i64> %tmp, <2 x i32> < i32 2, i32 1 >		; <<4 x i32>> [#uses=1]
-	ret <2 x i64> %tmp6
-
-; X86-32-LABEL: t2:
-; X86-32: movq	4(%esp), %xmm0
-
-; X86-64-LABEL: t2:
-; X86-64: movd	%r{{..}}, %xmm0
-}
-
-define <2 x i64> @t3(<2 x i64>* %a) nounwind  {
-entry:
-	%tmp4 = load <2 x i64>* %a, align 16		; <<2 x i64>> [#uses=1]
-	%tmp6 = bitcast <2 x i64> %tmp4 to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%tmp7 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp6, <4 x i32> < i32 4, i32 5, i32 2, i32 3 >		; <<4 x i32>> [#uses=1]
-	%tmp8 = bitcast <4 x i32> %tmp7 to <2 x i64>		; <<2 x i64>> [#uses=1]
-	ret <2 x i64> %tmp8
-
-; X86-32-LABEL: t3:
-; X86-32: movl	4(%esp)
-; X86-32: movq
-
-; X86-64-LABEL: t3:
-; X86-64: movq	({{.*}}), %xmm0
-}
-
-define <2 x i64> @t4(<2 x i64> %a) nounwind  {
-entry:
-	%tmp5 = bitcast <2 x i64> %a to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%tmp6 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp5, <4 x i32> < i32 4, i32 5, i32 2, i32 3 >		; <<4 x i32>> [#uses=1]
-	%tmp7 = bitcast <4 x i32> %tmp6 to <2 x i64>		; <<2 x i64>> [#uses=1]
-	ret <2 x i64> %tmp7
-
-; X86-32-LABEL: t4:
-; X86-32: movq %xmm0, %xmm0
-
-; X86-64-LABEL: t4:
-; X86-64: movq {{.*}}, %xmm0
-}
-
-define <2 x i64> @t5(<2 x i64> %a) nounwind  {
-entry:
-	%tmp6 = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <2 x i32> < i32 2, i32 1 >		; <<4 x i32>> [#uses=1]
-	ret <2 x i64> %tmp6
-
-; X86-32-LABEL: t5:
-; X86-32: movq %xmm0, %xmm0
-
-; X86-64-LABEL: t5:
-; X86-64: movq {{.*}}, %xmm0
-}
diff --git a/test/CodeGen/X86/vec_shuffle-15.ll b/test/CodeGen/X86/vec_shuffle-15.ll
deleted file mode 100644
index 5a9b8fd..0000000
--- a/test/CodeGen/X86/vec_shuffle-15.ll
+++ /dev/null
@@ -1,81 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
-
-define <2 x i64> @t00(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 0 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t01(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 1 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t02(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 2 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t03(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 3 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t10(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 0 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t11(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 1 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t12(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 2 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t13(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 3 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t20(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 0 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t21(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 1 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t22(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 2 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t23(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 3 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t30(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 0 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t31(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 1 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t32(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 2 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t33(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 3 >
-	ret <2 x i64> %tmp
-}
diff --git a/test/CodeGen/X86/vec_shuffle-16.ll b/test/CodeGen/X86/vec_shuffle-16.ll
deleted file mode 100644
index 9aeb942..0000000
--- a/test/CodeGen/X86/vec_shuffle-16.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse,-sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse
-; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse2
-
-; sse-LABEL:  t1:
-; sse2-LABEL: t1:
-define <4 x float> @t1(<4 x float> %a, <4 x float> %b) nounwind  {
-; sse: shufps
-; sse2: pshufd
-; sse2-NEXT: ret
-        %tmp1 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
-        ret <4 x float> %tmp1
-}
-
-; sse-LABEL:  t2:
-; sse2-LABEL: t2:
-define <4 x float> @t2(<4 x float> %A, <4 x float> %B) nounwind {
-; sse: shufps
-; sse2: pshufd
-; sse2-NEXT: ret
-	%tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 3, i32 3, i32 3, i32 3 >
-	ret <4 x float> %tmp
-}
-
-; sse-LABEL:  t3:
-; sse2-LABEL: t3:
-define <4 x float> @t3(<4 x float> %A, <4 x float> %B) nounwind {
-; sse: shufps
-; sse2: pshufd
-; sse2-NEXT: ret
-	%tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 4, i32 4, i32 4, i32 4 >
-	ret <4 x float> %tmp
-}
-
-; sse-LABEL:  t4:
-; sse2-LABEL: t4:
-define <4 x float> @t4(<4 x float> %A, <4 x float> %B) nounwind {
-
-; sse: shufps
-; sse2: pshufd
-; sse2-NEXT: ret
-	%tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 1, i32 3, i32 2, i32 0 >
-	ret <4 x float> %tmp
-}
diff --git a/test/CodeGen/X86/vec_shuffle-17.ll b/test/CodeGen/X86/vec_shuffle-17.ll
deleted file mode 100644
index f2f96ba..0000000
--- a/test/CodeGen/X86/vec_shuffle-17.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=-avx | FileCheck %s
-; CHECK-NOT: xor
-; CHECK: movd {{%rdi|%rcx}}, %xmm0
-; CHECK-NOT: xor
-; PR2108
-
-define <2 x i64> @doload64(i64 %x) nounwind  {
-entry:
-	%tmp717 = bitcast i64 %x to double		; <double> [#uses=1]
-	%tmp8 = insertelement <2 x double> undef, double %tmp717, i32 0		; <<2 x double>> [#uses=1]
-	%tmp9 = insertelement <2 x double> %tmp8, double 0.000000e+00, i32 1		; <<2 x double>> [#uses=1]
-	%tmp11 = bitcast <2 x double> %tmp9 to <2 x i64>		; <<2 x i64>> [#uses=1]
-	ret <2 x i64> %tmp11
-}
-
diff --git a/test/CodeGen/X86/vec_shuffle-18.ll b/test/CodeGen/X86/vec_shuffle-18.ll
deleted file mode 100644
index 1104a4a..0000000
--- a/test/CodeGen/X86/vec_shuffle-18.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin8.8.0 | grep mov | count 7
-
-	%struct.vector4_t = type { <4 x float> }
-
-define void @swizzle(i8* %a, %struct.vector4_t* %b, %struct.vector4_t* %c) nounwind  {
-entry:
-	%tmp9 = getelementptr %struct.vector4_t* %b, i32 0, i32 0		; <<4 x float>*> [#uses=2]
-	%tmp10 = load <4 x float>* %tmp9, align 16		; <<4 x float>> [#uses=1]
-	%tmp14 = bitcast i8* %a to double*		; <double*> [#uses=1]
-	%tmp15 = load double* %tmp14		; <double> [#uses=1]
-	%tmp16 = insertelement <2 x double> undef, double %tmp15, i32 0		; <<2 x double>> [#uses=1]
-	%tmp18 = bitcast <2 x double> %tmp16 to <4 x float>		; <<4 x float>> [#uses=1]
-	%tmp19 = shufflevector <4 x float> %tmp10, <4 x float> %tmp18, <4 x i32> < i32 4, i32 5, i32 2, i32 3 >		; <<4 x float>> [#uses=1]
-	store <4 x float> %tmp19, <4 x float>* %tmp9, align 16
-	%tmp28 = getelementptr %struct.vector4_t* %c, i32 0, i32 0		; <<4 x float>*> [#uses=2]
-	%tmp29 = load <4 x float>* %tmp28, align 16		; <<4 x float>> [#uses=1]
-	%tmp26 = getelementptr i8* %a, i32 8		; <i8*> [#uses=1]
-	%tmp33 = bitcast i8* %tmp26 to double*		; <double*> [#uses=1]
-	%tmp34 = load double* %tmp33		; <double> [#uses=1]
-	%tmp35 = insertelement <2 x double> undef, double %tmp34, i32 0		; <<2 x double>> [#uses=1]
-	%tmp37 = bitcast <2 x double> %tmp35 to <4 x float>		; <<4 x float>> [#uses=1]
-	%tmp38 = shufflevector <4 x float> %tmp29, <4 x float> %tmp37, <4 x i32> < i32 4, i32 5, i32 2, i32 3 >		; <<4 x float>> [#uses=1]
-	store <4 x float> %tmp38, <4 x float>* %tmp28, align 16
-	ret void
-}
diff --git a/test/CodeGen/X86/vec_shuffle-19.ll b/test/CodeGen/X86/vec_shuffle-19.ll
deleted file mode 100644
index 48db8de..0000000
--- a/test/CodeGen/X86/vec_shuffle-19.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; REQUIRES: asserts
-; RUN: llc < %s -o /dev/null -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 4
-; PR2485
-
-define <4 x i32> @t(<4 x i32> %a, <4 x i32> %b) nounwind  {
-entry:
-	%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> < i32 4, i32 0, i32 0, i32 0 >		; <<4 x i32>> [#uses=1]
-	ret <4 x i32> %shuffle
-}
diff --git a/test/CodeGen/X86/vec_shuffle-20.ll b/test/CodeGen/X86/vec_shuffle-20.ll
deleted file mode 100644
index 5a2c444..0000000
--- a/test/CodeGen/X86/vec_shuffle-20.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; REQUIRES: asserts
-; RUN: llc < %s -o /dev/null -march=x86 -mcpu=corei7 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 2
-
-define <4 x float> @func(<4 x float> %fp0, <4 x float> %fp1) nounwind  {
-entry:
-	shufflevector <4 x float> %fp0, <4 x float> %fp1, <4 x i32> < i32 0, i32 1, i32 2, i32 7 >		; <<4 x float>>:0 [#uses=1]
-	ret <4 x float> %0
-}
diff --git a/test/CodeGen/X86/vec_shuffle-22.ll b/test/CodeGen/X86/vec_shuffle-22.ll
deleted file mode 100644
index 6807e4d..0000000
--- a/test/CodeGen/X86/vec_shuffle-22.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=pentium-m  | FileCheck %s
-
-define <4 x float> @t1(<4 x float> %a) nounwind  {
-; CHECK: movlhps
-  %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> < i32 0, i32 1, i32 0, i32 1 >       ; <<4 x float>> [#uses=1]
-  ret <4 x float> %tmp1
-}
-
-define <4 x i32> @t2(<4 x i32>* %a) nounwind {
-; CHECK: pshufd
-; CHECK: ret
-  %tmp1 = load <4 x i32>* %a
-	%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> < i32 0, i32 1, i32 0, i32 1 >		; <<4 x i32>> [#uses=1]
-	ret <4 x i32> %tmp2
-}
diff --git a/test/CodeGen/X86/vec_shuffle-23.ll b/test/CodeGen/X86/vec_shuffle-23.ll
deleted file mode 100644
index 2468735..0000000
--- a/test/CodeGen/X86/vec_shuffle-23.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2                | not grep punpck
-; RUN: llc < %s -march=x86 -mattr=+sse2                |     grep pshufd
-
-define i32 @t() nounwind {
-entry:
-	%a = alloca <4 x i32>		; <<4 x i32>*> [#uses=2]
-	%b = alloca <4 x i32>		; <<4 x i32>*> [#uses=5]
-	store volatile <4 x i32> < i32 0, i32 1, i32 2, i32 3 >, <4 x i32>* %a
-	%tmp = load <4 x i32>* %a		; <<4 x i32>> [#uses=1]
-	store <4 x i32> %tmp, <4 x i32>* %b
-	%tmp1 = load <4 x i32>* %b		; <<4 x i32>> [#uses=1]
-	%tmp2 = load <4 x i32>* %b		; <<4 x i32>> [#uses=1]
-	%punpckldq = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x i32>> [#uses=1]
-	store <4 x i32> %punpckldq, <4 x i32>* %b
-	%tmp3 = load <4 x i32>* %b		; <<4 x i32>> [#uses=1]
-	%result = extractelement <4 x i32> %tmp3, i32 0		; <i32> [#uses=1]
-	ret i32 %result
-}
diff --git a/test/CodeGen/X86/vec_shuffle-24.ll b/test/CodeGen/X86/vec_shuffle-24.ll
deleted file mode 100644
index d038daf..0000000
--- a/test/CodeGen/X86/vec_shuffle-24.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
-
-define i32 @t() nounwind optsize {
-entry:
-; CHECK: punpckldq
-	%a = alloca <4 x i32>		; <<4 x i32>*> [#uses=2]
-	%b = alloca <4 x i32>		; <<4 x i32>*> [#uses=5]
-	store volatile <4 x i32> < i32 0, i32 1, i32 2, i32 3 >, <4 x i32>* %a
-	%tmp = load <4 x i32>* %a		; <<4 x i32>> [#uses=1]
-	store <4 x i32> %tmp, <4 x i32>* %b
-	%tmp1 = load <4 x i32>* %b		; <<4 x i32>> [#uses=1]
-	%tmp2 = load <4 x i32>* %b		; <<4 x i32>> [#uses=1]
-	%punpckldq = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x i32>> [#uses=1]
-	store <4 x i32> %punpckldq, <4 x i32>* %b
-	%tmp3 = load <4 x i32>* %b		; <<4 x i32>> [#uses=1]
-	%result = extractelement <4 x i32> %tmp3, i32 0		; <i32> [#uses=1]
-	ret i32 %result
-}
diff --git a/test/CodeGen/X86/vec_shuffle-25.ll b/test/CodeGen/X86/vec_shuffle-25.ll
deleted file mode 100644
index 3f42a13..0000000
--- a/test/CodeGen/X86/vec_shuffle-25.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=sse4.1 -o %t
-; RUN: grep unpcklps %t | count 3
-; RUN: grep unpckhps %t | count 1
- 
-; Transpose example using the more generic vector shuffle.  We return
-; float8 instead of float16 since x86 can return that in register.
-; ModuleID = 'transpose2_opt.bc'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i386-apple-cl.1.0"
-@r0 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
-@r1 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
-@r2 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
-@r3 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
-
-define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
-entry:
-	%unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
-	%unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
-	%unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
-	%unpckhps11 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
-	%unpcklps14 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=1]
-	%unpcklps14a = shufflevector <4 x float> %unpcklps14,  <4 x float> undef,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-	%unpckhps17 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
-	%unpckhps17a = shufflevector <4 x float> %unpckhps17,  <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-	%r1 = shufflevector <16 x float> %unpcklps14a,  <16 x float> %unpckhps17a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-	%unpcklps20 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=1]
-	%unpcklps20a = shufflevector <4 x float> %unpcklps20,  <4 x float> undef,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-	%r2 = shufflevector <16 x float> %r1,  <16 x float> %unpcklps20a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-	%unpckhps23 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
-	%unpckhps23a = shufflevector <4 x float> %unpckhps23,  <4 x float> undef,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-	%r3 = shufflevector <16 x float> %r2,  <16 x float> %unpckhps23a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-	%r4 = shufflevector <16 x float> %r3,  <16 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-	ret <8 x float> %r4
-}
diff --git a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll
deleted file mode 100644
index 00e8e73..0000000
--- a/test/CodeGen/X86/vec_shuffle-26.ll
+++ /dev/null
@@ -1,68 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse4.1 | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
-
-; Transpose example using the more generic vector shuffle. Return float8
-; instead of float16
-; ModuleID = 'transpose2_opt.bc'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i386-apple-cl.1.0"
-@r0 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
-@r1 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
-@r2 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
-@r3 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
-
-define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
-entry:
-; CHECK: transpose2
-; CHECK: unpckhps
-; CHECK: unpckhps
-; CHECK: unpcklps
-; CHECK: unpckhps
-; Different instruction order for Atom.
-; ATOM: transpose2
-; ATOM: unpckhps
-; ATOM: unpckhps
-; ATOM: unpckhps
-; ATOM: unpcklps
-	%unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
-	%unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
-	%unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
-	%unpckhps11 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
-	%unpcklps14 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=1]
-	%unpckhps17 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
-        %r1 = shufflevector <4 x float> %unpcklps14,  <4 x float> %unpckhps17,  <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
-	%unpcklps20 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=1]
-	%unpckhps23 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
-        %r2 = shufflevector <4 x float> %unpcklps20,  <4 x float> %unpckhps23,  <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
-;       %r3 = shufflevector <8 x float> %r1,  <8 x float> %r2,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >; 
-	ret <8 x float> %r2
-}
-
-define <2 x i64> @lo_hi_shift(float* nocapture %x, float* nocapture %y) nounwind {
-entry:
-; movhps should happen before extractps to assure it gets the correct value.
-; CHECK: lo_hi_shift
-; CHECK: movhps ([[BASEREG:%[a-z]+]]),
-; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
-; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
-; ATOM: lo_hi_shift
-; ATOM: movhps ([[BASEREG:%[a-z]+]]),
-; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
-; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
-  %v.i = bitcast float* %y to <4 x float>*
-  %0 = load <4 x float>* %v.i, align 1
-  %1 = bitcast float* %x to <1 x i64>*
-  %.val = load <1 x i64>* %1, align 1
-  %2 = bitcast <1 x i64> %.val to <2 x float>
-  %shuffle.i = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  %shuffle1.i = shufflevector <4 x float> %0, <4 x float> %shuffle.i, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  %cast.i = bitcast <4 x float> %0 to <2 x i64>
-  %extract.i = extractelement <2 x i64> %cast.i, i32 1
-  %3 = bitcast float* %x to i64*
-  store i64 %extract.i, i64* %3, align 4
-  %4 = bitcast <4 x float> %0 to <16 x i8>
-  %5 = bitcast <4 x float> %shuffle1.i to <16 x i8>
-  %palignr = shufflevector <16 x i8> %5, <16 x i8> %4, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-  %6 = bitcast <16 x i8> %palignr to <2 x i64>
-  ret <2 x i64> %6
-}
diff --git a/test/CodeGen/X86/vec_shuffle-27.ll b/test/CodeGen/X86/vec_shuffle-27.ll
deleted file mode 100644
index c9b2fb5..0000000
--- a/test/CodeGen/X86/vec_shuffle-27.ll
+++ /dev/null
@@ -1,38 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse4.1 | FileCheck %s
-
-; ModuleID = 'vec_shuffle-27.bc'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i686-apple-cl.1.0"
-
-define <8 x float> @my2filter4_1d(<4 x float> %a, <8 x float> %T0, <8 x float> %T1) nounwind readnone {
-entry:
-; CHECK: subps
-; CHECK: subps
-; CHECK: mulps
-; CHECK: mulps
-; CHECK: addps
-; CHECK: addps
-	%tmp7 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3 >		; <<8 x float>> [#uses=1]
-	%sub = fsub <8 x float> %T1, %T0		; <<8 x float>> [#uses=1]
-	%mul = fmul <8 x float> %sub, %tmp7		; <<8 x float>> [#uses=1]
-	%add = fadd <8 x float> %mul, %T0		; <<8 x float>> [#uses=1]
-	ret <8 x float> %add
-}
-
-; Test case for r122206
-define void @test2(<4 x i64>* %ap, <4 x i64>* %bp) nounwind {
-entry:
-; CHECK: movdqa
-  %a = load <4 x i64> * %ap
-  %b = load <4 x i64> * %bp
-  %mulaa = mul <4 x i64> %a, %a
-  %mulbb = mul <4 x i64> %b, %b
-  %mulab = mul <4 x i64> %a, %b
-  %vect1271 = shufflevector <4 x i64> %mulaa, <4 x i64> %mulbb, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
-  %vect1272 = shufflevector <4 x i64> %mulaa, <4 x i64> %mulbb, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
-  %vect1487 = shufflevector <4 x i64> %vect1271, <4 x i64> %mulab, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-  %vect1488 = shufflevector <4 x i64> %vect1272, <4 x i64> %mulab, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
-  store <4 x i64> %vect1487, <4 x i64>* %ap
-  store <4 x i64> %vect1488, <4 x i64>* %bp
-  ret void;
-}
diff --git a/test/CodeGen/X86/vec_shuffle-28.ll b/test/CodeGen/X86/vec_shuffle-28.ll
deleted file mode 100644
index ebf5577..0000000
--- a/test/CodeGen/X86/vec_shuffle-28.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
-
-; CHECK:     pshufb
-; CHECK-NOT: pshufb
-
-; FIXME: this test has a superfluous punpcklqdq pre-pshufb currently.
-;        Don't XFAIL it because it's still better than the previous code.
-
-; Pack various elements via shuffles.
-define <8 x i16> @shuf1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
-	%tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
-	ret <8 x i16> %tmp7
-}
diff --git a/test/CodeGen/X86/vec_shuffle-30.ll b/test/CodeGen/X86/vec_shuffle-30.ll
deleted file mode 100644
index f5f8842..0000000
--- a/test/CodeGen/X86/vec_shuffle-30.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s
-
-; CHECK: test
-; Test case when creating pshufhw, we incorrectly set the higher order bit
-; for an undef,
-define void @test(<8 x i16>* %dest, <8 x i16> %in) nounwind {
-entry:
-; CHECK-NOT: vmovaps
-; CHECK: vmovlpd
-; CHECK: vpshufhw        $-95
-  %0 = load <8 x i16>* %dest
-  %1 = shufflevector <8 x i16> %0, <8 x i16> %in, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 13, i32 undef, i32 14, i32 14>
-  store <8 x i16> %1, <8 x i16>* %dest
-  ret void
-}
-
-; CHECK: test2
-; A test case where we shouldn't generate a punpckldq but a pshufd and a pslldq
-define void @test2(<4 x i32>* %dest, <4 x i32> %in) nounwind {
-entry:
-; CHECK-NOT: pslldq
-; CHECK: shufps
-  %0 = shufflevector <4 x i32> %in, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> < i32 undef, i32 5, i32 undef, i32 2>
-  store <4 x i32> %0, <4 x i32>* %dest
-  ret void
-}
diff --git a/test/CodeGen/X86/vec_shuffle-31.ll b/test/CodeGen/X86/vec_shuffle-31.ll
deleted file mode 100644
index bb06e15..0000000
--- a/test/CodeGen/X86/vec_shuffle-31.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 -o %t
-; RUN: grep pshufb %t | count 1
-
-define <8 x i16> @shuf3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
-	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
-	ret <8 x i16> %tmp9
-}
diff --git a/test/CodeGen/X86/vec_shuffle-34.ll b/test/CodeGen/X86/vec_shuffle-34.ll
deleted file mode 100644
index d057b3f..0000000
--- a/test/CodeGen/X86/vec_shuffle-34.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 | grep pshufb | count 2
-
-define <8 x i16> @shuf2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
-	%tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
-	ret <8 x i16> %tmp8
-}
diff --git a/test/CodeGen/X86/vec_shuffle-35.ll b/test/CodeGen/X86/vec_shuffle-35.ll
deleted file mode 100644
index f5083b4..0000000
--- a/test/CodeGen/X86/vec_shuffle-35.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -stack-alignment=16 -o %t
-; RUN: grep pextrw %t | count 12
-; RUN: grep pinsrw %t | count 13
-; RUN: grep rolw %t | count 13
-; RUN: not grep esp %t
-; RUN: not grep ebp %t
-; RUN: llc < %s -march=x86 -mcpu=core2 -stack-alignment=16 -o %t
-; RUN: grep pshufb %t | count 3
-
-define <16 x i8> @shuf1(<16 x i8> %T0) nounwind readnone {
-entry:
-	%tmp8 = shufflevector <16 x i8> %T0, <16 x i8> undef, <16 x i32> < i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 >
-	ret <16 x i8> %tmp8
-}
-
-define <16 x i8> @shuf2(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-entry:
-	%tmp8 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> < i32 undef, i32 undef, i32 3, i32 2, i32 17, i32 16, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 >
-	ret <16 x i8> %tmp8
-}
diff --git a/test/CodeGen/X86/vec_shuffle-36.ll b/test/CodeGen/X86/vec_shuffle-36.ll
deleted file mode 100644
index f1d0f93..0000000
--- a/test/CodeGen/X86/vec_shuffle-36.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=sse4.1 | FileCheck %s
-
-define <8 x i16> @shuf6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK: ret
-entry:
-  %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 3, i32 2, i32 0, i32 2, i32 1, i32 5, i32 6 , i32 undef >
-  ret <8 x i16> %tmp9
-}
-
-define <8 x i16> @shuf7(<8 x i16> %t0) {
-; CHECK: pshufd
-  %tmp10 = shufflevector <8 x i16> %t0, <8 x i16> undef, <8 x i32> < i32 undef, i32 2, i32 2, i32 2, i32 2, i32 2, i32 undef, i32 undef >
-  ret <8 x i16> %tmp10
-}
diff --git a/test/CodeGen/X86/vec_shuffle-37.ll b/test/CodeGen/X86/vec_shuffle-37.ll
deleted file mode 100644
index ed285f9..0000000
--- a/test/CodeGen/X86/vec_shuffle-37.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=core2 | FileCheck %s
-; RUN: llc -O0 < %s -march=x86 -mcpu=core2 | FileCheck %s --check-prefix=CHECK_O0
-
-define <4 x i32> @t00(<4 x i32>* %a0) nounwind ssp {
-entry:
-; CHECK: movaps  ({{%rdi|%rcx}}), %[[XMM0:xmm[0-9]+]]
-; CHECK: movaps  %[[XMM0]], %[[XMM1:xmm[0-9]+]]
-; CHECK-NEXT: movss   %xmm{{[0-9]+}}, %[[XMM1]]
-; CHECK-NEXT: shufps  $36, %[[XMM1]], %[[XMM0]]
-  %0 = load <4 x i32>* undef, align 16
-  %1 = load <4 x i32>* %a0, align 16
-  %2 = shufflevector <4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-  ret <4 x i32> %2
-}
-
-define void @t01(double* %a0) nounwind ssp {
-entry:
-; CHECK_O0: movsd (%eax), %xmm0
-; CHECK_O0: unpcklpd  %xmm0, %xmm0
-  %tmp93 = load double* %a0, align 8
-  %vecinit94 = insertelement <2 x double> undef, double %tmp93, i32 1
-  store <2 x double> %vecinit94, <2 x double>* undef
-  ret void
-}
-
-define void @t02(<8 x i32>* %source, <2 x i32>* %dest) nounwind noinline {
-entry:
-; CHECK: t02
-; CHECK: movaps
-; CHECK: shufps
-; CHECK: pshufd
-; CHECK: movq
-; CHECK: ret
-  %0 = bitcast <8 x i32>* %source to <4 x i32>*
-  %arrayidx = getelementptr inbounds <4 x i32>* %0, i64 3
-  %tmp2 = load <4 x i32>* %arrayidx, align 16
-  %tmp3 = extractelement <4 x i32> %tmp2, i32 0
-  %tmp5 = insertelement <2 x i32> <i32 undef, i32 0>, i32 %tmp3, i32 0
-  %arrayidx7 = getelementptr inbounds <8 x i32>* %source, i64 1
-  %1 = bitcast <8 x i32>* %arrayidx7 to <4 x i32>*
-  %tmp8 = load <4 x i32>* %1, align 16
-  %tmp9 = extractelement <4 x i32> %tmp8, i32 1
-  %tmp11 = insertelement <2 x i32> %tmp5, i32 %tmp9, i32 1
-  store <2 x i32> %tmp11, <2 x i32>* %dest, align 8
-  ret void
-}
diff --git a/test/CodeGen/X86/vec_shuffle-38.ll b/test/CodeGen/X86/vec_shuffle-38.ll
deleted file mode 100644
index ec196df..0000000
--- a/test/CodeGen/X86/vec_shuffle-38.ll
+++ /dev/null
@@ -1,77 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
-
-define <2 x double> @ld(<2 x double> %p) nounwind optsize ssp {
-; CHECK: unpcklpd
-  %shuffle = shufflevector <2 x double> %p, <2 x double> undef, <2 x i32> zeroinitializer
-  ret <2 x double> %shuffle
-}
-
-define <2 x double> @hd(<2 x double> %p) nounwind optsize ssp {
-; CHECK: unpckhpd
-  %shuffle = shufflevector <2 x double> %p, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-  ret <2 x double> %shuffle
-}
-
-define <2 x i64> @ldi(<2 x i64> %p) nounwind optsize ssp {
-; CHECK: punpcklqdq
-  %shuffle = shufflevector <2 x i64> %p, <2 x i64> undef, <2 x i32> zeroinitializer
-  ret <2 x i64> %shuffle
-}
-
-define <2 x i64> @hdi(<2 x i64> %p) nounwind optsize ssp {
-; CHECK: punpckhqdq
-  %shuffle = shufflevector <2 x i64> %p, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-  ret <2 x i64> %shuffle
-}
-
-; rdar://10050549
-%struct.Float2 = type { float, float }
-
-define <4 x float> @loadhpi(%struct.Float2* %vPtr, <4 x float> %vecin1) nounwind readonly ssp {
-entry:
-; CHECK: loadhpi
-; CHECK-NOT: movq
-; CHECK: movhps (
-  %tmp1 = bitcast %struct.Float2* %vPtr to <1 x i64>*
-  %addptr7 = getelementptr inbounds <1 x i64>* %tmp1, i64 0
-  %tmp2 = bitcast <1 x i64>* %addptr7 to float*
-  %tmp3 = load float* %tmp2, align 4
-  %vec = insertelement <4 x float> undef, float %tmp3, i32 0
-  %addptr.i12 = getelementptr inbounds float* %tmp2, i64 1
-  %tmp4 = load float* %addptr.i12, align 4
-  %vecin2 = insertelement <4 x float> %vec, float %tmp4, i32 1
-  %shuffle = shufflevector <4 x float> %vecin1, <4 x float> %vecin2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  ret <4 x float> %shuffle
-}
-
-; rdar://10119696
-; CHECK: f
-define <4 x float> @f(<4 x float> %x, double* nocapture %y) nounwind readonly ssp {
-entry:
-  ; CHECK: movlps  (%{{rdi|rdx}}), %xmm0
-  %u110.i = load double* %y, align 1
-  %tmp8.i = insertelement <2 x double> undef, double %u110.i, i32 0
-  %tmp9.i = bitcast <2 x double> %tmp8.i to <4 x float>
-  %shuffle.i = shufflevector <4 x float> %x, <4 x float> %tmp9.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-  ret <4 x float> %shuffle.i
-}
-
-define <4 x float> @loadhpi2(%struct.Float2* nocapture %vHiCoefPtr_0, %struct.Float2* nocapture %vLoCoefPtr_0, i32 %s) nounwind readonly ssp {
-entry:
-; CHECK: loadhpi2
-; CHECK: movhps (
-; CHECK-NOT: movlhps
-  %0 = bitcast %struct.Float2* %vHiCoefPtr_0 to <1 x i64>*
-  %idx.ext = sext i32 %s to i64
-  %add.ptr = getelementptr inbounds <1 x i64>* %0, i64 %idx.ext
-  %add.ptr.val = load <1 x i64>* %add.ptr, align 1
-  %1 = bitcast <1 x i64> %add.ptr.val to <2 x float>
-  %shuffle.i = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  %2 = bitcast %struct.Float2* %vLoCoefPtr_0 to <1 x i64>*
-  %add.ptr2 = getelementptr inbounds <1 x i64>* %2, i64 %idx.ext
-  %add.ptr2.val = load <1 x i64>* %add.ptr2, align 1
-  %3 = bitcast <1 x i64> %add.ptr2.val to <2 x float>
-  %shuffle.i4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  %shuffle1.i5 = shufflevector <4 x float> %shuffle.i, <4 x float> %shuffle.i4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  ret <4 x float> %shuffle1.i5
-}
diff --git a/test/CodeGen/X86/vec_shuffle-39.ll b/test/CodeGen/X86/vec_shuffle-39.ll
deleted file mode 100644
index 8fd9a5c..0000000
--- a/test/CodeGen/X86/vec_shuffle-39.ll
+++ /dev/null
@@ -1,86 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn | FileCheck %s
-; rdar://10050222, rdar://10134392
-
-define <4 x float> @t1(<4 x float> %a, <1 x i64>* nocapture %p) nounwind {
-entry:
-; CHECK-LABEL: t1:
-; CHECK: movlps (%rdi), %xmm0
-; CHECK: ret
-  %p.val = load <1 x i64>* %p, align 1
-  %0 = bitcast <1 x i64> %p.val to <2 x float>
-  %shuffle.i = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  %shuffle1.i = shufflevector <4 x float> %a, <4 x float> %shuffle.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-  ret <4 x float> %shuffle1.i
-}
-
-define <4 x float> @t1a(<4 x float> %a, <1 x i64>* nocapture %p) nounwind {
-entry:
-; CHECK-LABEL: t1a:
-; CHECK: movlps (%rdi), %xmm0
-; CHECK: ret
-  %0 = bitcast <1 x i64>* %p to double*
-  %1 = load double* %0
-  %2 = insertelement <2 x double> undef, double %1, i32 0
-  %3 = bitcast <2 x double> %2 to <4 x float>
-  %4 = shufflevector <4 x float> %a, <4 x float> %3, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-  ret <4 x float> %4
-}
-
-define void @t2(<1 x i64>* nocapture %p, <4 x float> %a) nounwind {
-entry:
-; CHECK-LABEL: t2:
-; CHECK: movlps %xmm0, (%rdi)
-; CHECK: ret
-  %cast.i = bitcast <4 x float> %a to <2 x i64>
-  %extract.i = extractelement <2 x i64> %cast.i, i32 0
-  %0 = getelementptr inbounds <1 x i64>* %p, i64 0, i64 0
-  store i64 %extract.i, i64* %0, align 8
-  ret void
-}
-
-define void @t2a(<1 x i64>* nocapture %p, <4 x float> %a) nounwind {
-entry:
-; CHECK-LABEL: t2a:
-; CHECK: movlps %xmm0, (%rdi)
-; CHECK: ret
-  %0 = bitcast <1 x i64>* %p to double*
-  %1 = bitcast <4 x float> %a to <2 x double>
-  %2 = extractelement <2 x double> %1, i32 0
-  store double %2, double* %0
-  ret void
-}
-
-; rdar://10436044
-define <2 x double> @t3() nounwind readonly {
-bb:
-; CHECK-LABEL: t3:
-; CHECK: movq (%rax), %xmm1
-; CHECK: punpcklqdq %xmm2, %xmm0
-; CHECK: movsd %xmm1, %xmm0
-  %tmp0 = load i128* null, align 1
-  %tmp1 = load <2 x i32>* undef, align 8
-  %tmp2 = bitcast i128 %tmp0 to <16 x i8>
-  %tmp3 = bitcast <2 x i32> %tmp1 to i64
-  %tmp4 = insertelement <2 x i64> undef, i64 %tmp3, i32 0
-  %tmp5 = bitcast <16 x i8> %tmp2 to <2 x double>
-  %tmp6 = bitcast <2 x i64> %tmp4 to <2 x double>
-  %tmp7 = shufflevector <2 x double> %tmp5, <2 x double> %tmp6, <2 x i32> <i32 2, i32 1>
-  ret <2 x double> %tmp7
-}
-
-; rdar://10450317
-define <2 x i64> @t4() nounwind readonly {
-bb:
-; CHECK-LABEL: t4:
-; CHECK: movq (%rax), %xmm0
-; CHECK: punpcklqdq %{{xmm.}}, %[[XMM:xmm[0-9]]]
-; CHECK: movsd %[[XMM]], %xmm0
-  %tmp0 = load i128* null, align 1
-  %tmp1 = load <2 x i32>* undef, align 8
-  %tmp2 = bitcast i128 %tmp0 to <16 x i8>
-  %tmp3 = bitcast <2 x i32> %tmp1 to i64
-  %tmp4 = insertelement <2 x i64> undef, i64 %tmp3, i32 0
-  %tmp5 = bitcast <16 x i8> %tmp2 to <2 x i64>
-  %tmp6 = shufflevector <2 x i64> %tmp4, <2 x i64> %tmp5, <2 x i32> <i32 2, i32 1>
-  ret <2 x i64> %tmp6
-}
diff --git a/test/CodeGen/X86/vec_shuffle-40.ll b/test/CodeGen/X86/vec_shuffle-40.ll
deleted file mode 100644
index 75b45e3..0000000
--- a/test/CodeGen/X86/vec_shuffle-40.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
-
-define void @shuffle_v16i16(<16 x i16>* %a) {
-; CHECK-LABEL: shuffle_v16i16:
-; CHECK: vpshufb {{.*}}%ymm
-; CHECK-NOT: vpshufb {{.*}}%xmm
-entry:
-  %0 = load <16 x i16>* %a, align 32
-  %shuffle = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
-  store <16 x i16> %shuffle, <16 x i16>* %a, align 32
-  ret void
-}
-
-define void @shuffle_v16i16_lanecrossing(<16 x i16>* %a) {
-; CHECK-LABEL: shuffle_v16i16_lanecrossing:
-; CHECK-NOT: vpshufb {{.*}}%ymm
-entry:
-  %0 = load <16 x i16>* %a, align 32
-  %shuffle = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 13, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
-  store <16 x i16> %shuffle, <16 x i16>* %a, align 32
-  ret void
-}
diff --git a/test/CodeGen/X86/vec_shuffle-41.ll b/test/CodeGen/X86/vec_shuffle-41.ll
deleted file mode 100644
index 28fdd2f..0000000
--- a/test/CodeGen/X86/vec_shuffle-41.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
-
-; Use buildFromShuffleMostly which allows this to be generated as two 128-bit
-; shuffles and an insert.
-
-; This is the (somewhat questionable) LLVM IR that is generated for:
-;    x8.s0123456 = x8.s1234567;  // x8 is a <8 x float> type
-;    x8.s7 = f;                  // f is float
-
-
-define <8 x float> @test1(<8 x float> %a, float %b) {
-; CHECK-LABEL: test1:
-; CHECK: vinsertps
-; CHECK-NOT: vinsertps
-entry:
-  %shift = shufflevector <8 x float> %a, <8 x float> undef, <7 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %extend = shufflevector <7 x float> %shift, <7 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef>
-  %insert = insertelement <8 x float> %extend, float %b, i32 7
-
-  ret <8 x float> %insert
-}
diff --git a/test/CodeGen/X86/vec_shuffle.ll b/test/CodeGen/X86/vec_shuffle.ll
deleted file mode 100644
index 6599598..0000000
--- a/test/CodeGen/X86/vec_shuffle.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: llc < %s -mtriple=i686-linux -mcpu=core2 | FileCheck %s
-
-; CHECK: test_v4sf
-; CHECK: movq 8(%esp)
-; CHECK: pshufd $80
-define void @test_v4sf(<4 x float>* %P, float %X, float %Y) nounwind {
-	%tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0		; <<4 x float>> [#uses=1]
-	%tmp2 = insertelement <4 x float> %tmp, float %X, i32 1		; <<4 x float>> [#uses=1]
-	%tmp4 = insertelement <4 x float> %tmp2, float %Y, i32 2		; <<4 x float>> [#uses=1]
-	%tmp6 = insertelement <4 x float> %tmp4, float %Y, i32 3		; <<4 x float>> [#uses=1]
-	store <4 x float> %tmp6, <4 x float>* %P
-	ret void
-}
-
-; CHECK: test_v2sd
-; CHECK: movups	8(%esp)
-; CHECK: movaps
-define void @test_v2sd(<2 x double>* %P, double %X, double %Y) nounwind {
-	%tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0		; <<2 x double>> [#uses=1]
-	%tmp2 = insertelement <2 x double> %tmp, double %Y, i32 1		; <<2 x double>> [#uses=1]
-	store <2 x double> %tmp2, <2 x double>* %P
-	ret void
-}
-
-; CHECK: test_v8i16
-; CHECK: pshufhw $-58
-; CHECK: movdqa
-define void @test_v8i16(<2 x i64>* %res, <2 x i64>* %A) nounwind {
-	%tmp = load <2 x i64>* %A		; <<2 x i64>> [#uses=1]
-	%tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16>		; <<8 x i16>> [#uses=8]
-	%tmp.upgrd.2 = extractelement <8 x i16> %tmp.upgrd.1, i32 0		; <i16> [#uses=1]
-	%tmp1 = extractelement <8 x i16> %tmp.upgrd.1, i32 1		; <i16> [#uses=1]
-	%tmp2 = extractelement <8 x i16> %tmp.upgrd.1, i32 2		; <i16> [#uses=1]
-	%tmp3 = extractelement <8 x i16> %tmp.upgrd.1, i32 3		; <i16> [#uses=1]
-	%tmp4 = extractelement <8 x i16> %tmp.upgrd.1, i32 6		; <i16> [#uses=1]
-	%tmp5 = extractelement <8 x i16> %tmp.upgrd.1, i32 5		; <i16> [#uses=1]
-	%tmp6 = extractelement <8 x i16> %tmp.upgrd.1, i32 4		; <i16> [#uses=1]
-	%tmp7 = extractelement <8 x i16> %tmp.upgrd.1, i32 7		; <i16> [#uses=1]
-	%tmp8 = insertelement <8 x i16> undef, i16 %tmp.upgrd.2, i32 0		; <<8 x i16>> [#uses=1]
-	%tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 1		; <<8 x i16>> [#uses=1]
-	%tmp10 = insertelement <8 x i16> %tmp9, i16 %tmp2, i32 2		; <<8 x i16>> [#uses=1]
-	%tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 3		; <<8 x i16>> [#uses=1]
-	%tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp4, i32 4		; <<8 x i16>> [#uses=1]
-	%tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 5		; <<8 x i16>> [#uses=1]
-	%tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp6, i32 6		; <<8 x i16>> [#uses=1]
-	%tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 7		; <<8 x i16>> [#uses=1]
-	%tmp15.upgrd.3 = bitcast <8 x i16> %tmp15 to <2 x i64>		; <<2 x i64>> [#uses=1]
-	store <2 x i64> %tmp15.upgrd.3, <2 x i64>* %res
-	ret void
-}
diff --git a/test/CodeGen/X86/vec_splat-2.ll b/test/CodeGen/X86/vec_splat-2.ll
deleted file mode 100644
index 9d82f97..0000000
--- a/test/CodeGen/X86/vec_splat-2.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s
-
-define void @test(<2 x i64>* %P, i8 %x) nounwind {
-	%tmp = insertelement <16 x i8> zeroinitializer, i8 %x, i32 0		; <<16 x i8>> [#uses=1]
-	%tmp36 = insertelement <16 x i8> %tmp, i8 %x, i32 1		; <<16 x i8>> [#uses=1]
-	%tmp38 = insertelement <16 x i8> %tmp36, i8 %x, i32 2		; <<16 x i8>> [#uses=1]
-	%tmp40 = insertelement <16 x i8> %tmp38, i8 %x, i32 3		; <<16 x i8>> [#uses=1]
-	%tmp42 = insertelement <16 x i8> %tmp40, i8 %x, i32 4		; <<16 x i8>> [#uses=1]
-	%tmp44 = insertelement <16 x i8> %tmp42, i8 %x, i32 5		; <<16 x i8>> [#uses=1]
-	%tmp46 = insertelement <16 x i8> %tmp44, i8 %x, i32 6		; <<16 x i8>> [#uses=1]
-	%tmp48 = insertelement <16 x i8> %tmp46, i8 %x, i32 7		; <<16 x i8>> [#uses=1]
-	%tmp50 = insertelement <16 x i8> %tmp48, i8 %x, i32 8		; <<16 x i8>> [#uses=1]
-	%tmp52 = insertelement <16 x i8> %tmp50, i8 %x, i32 9		; <<16 x i8>> [#uses=1]
-	%tmp54 = insertelement <16 x i8> %tmp52, i8 %x, i32 10		; <<16 x i8>> [#uses=1]
-	%tmp56 = insertelement <16 x i8> %tmp54, i8 %x, i32 11		; <<16 x i8>> [#uses=1]
-	%tmp58 = insertelement <16 x i8> %tmp56, i8 %x, i32 12		; <<16 x i8>> [#uses=1]
-	%tmp60 = insertelement <16 x i8> %tmp58, i8 %x, i32 13		; <<16 x i8>> [#uses=1]
-	%tmp62 = insertelement <16 x i8> %tmp60, i8 %x, i32 14		; <<16 x i8>> [#uses=1]
-	%tmp64 = insertelement <16 x i8> %tmp62, i8 %x, i32 15		; <<16 x i8>> [#uses=1]
-	%tmp68 = load <2 x i64>* %P		; <<2 x i64>> [#uses=1]
-	%tmp71 = bitcast <2 x i64> %tmp68 to <16 x i8>		; <<16 x i8>> [#uses=1]
-	%tmp73 = add <16 x i8> %tmp71, %tmp64		; <<16 x i8>> [#uses=1]
-	%tmp73.upgrd.1 = bitcast <16 x i8> %tmp73 to <2 x i64>		; <<2 x i64>> [#uses=1]
-	store <2 x i64> %tmp73.upgrd.1, <2 x i64>* %P
-	ret void
-
-; CHECK-LABEL: test:
-; CHECK-NOT: pshufd
-; CHECK: punpcklbw
-; CHECK: punpcklbw
-; CHECK: pshufd $0
-; CHECK-NOT: pshufd
-}
diff --git a/test/CodeGen/X86/vec_splat-3.ll b/test/CodeGen/X86/vec_splat-3.ll
deleted file mode 100644
index 754cbf4..0000000
--- a/test/CodeGen/X86/vec_splat-3.ll
+++ /dev/null
@@ -1,230 +0,0 @@
-; RUN: llc <%s -march=x86 -mcpu=penryn -mattr=sse4.1 | FileCheck %s
-
-; Splat test for v8i16
-define <8 x i16> @shuf_8i16_0(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef>
-	ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_0:
-; CHECK: pshuflw $0
-}
-
-define <8 x i16> @shuf_8i16_1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-	ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_1:
-; CHECK: pshuflw $5
-}
-
-define <8 x i16> @shuf_8i16_2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef, i32 undef>
-	ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_2:
-; CHECK: punpcklwd
-; CHECK-NEXT: pshufd $-86
-}
-
-define <8 x i16> @shuf_8i16_3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-	ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_3:
-; CHECK: pshuflw $15
-}
-
-define <8 x i16> @shuf_8i16_4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 4, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef>
-	ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_4:
-; CHECK: movhlps
-}
-
-define <8 x i16> @shuf_8i16_5(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 5, i32 undef, i32 undef, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
-	ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_5:
-; CHECK: punpckhwd
-; CHECK-NEXT: pshufd $85
-}
-
-define <8 x i16> @shuf_8i16_6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 6, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-	ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_6:
-; CHECK: punpckhwd
-; CHECK-NEXT: pshufd $-86
-}
-
-define <8 x i16> @shuf_8i16_7(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 7, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-	ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_7:
-; CHECK: punpckhwd
-; CHECK-NEXT: pshufd $-1
-}
-
-; Splat test for v16i8
-define <16 x i8> @shuf_16i8_8(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_8:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $0
-}
-
-define <16 x i8> @shuf_16i8_9(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_9:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $85
-}
-
-define <16 x i8> @shuf_16i8_10(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_10:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $-86
-}
-
-define <16 x i8> @shuf_16i8_11(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 3, i32 undef, i32 undef, i32 3, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_11:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $-1
-}
-
-
-define <16 x i8> @shuf_16i8_12(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 4, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_12:
-; CHECK: pshufd $5
-}
-
-define <16 x i8> @shuf_16i8_13(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 5, i32 undef, i32 undef, i32 5, i32 undef, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_13:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $85
-}
-
-define <16 x i8> @shuf_16i8_14(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 6, i32 undef, i32 undef, i32 6, i32 undef, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_14:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $-86
-}
-
-define <16 x i8> @shuf_16i8_15(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 7, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_15:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $-1
-}
-
-define <16 x i8> @shuf_16i8_16(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 8, i32 undef, i32 undef, i32 8, i32 undef, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_16:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $0
-}
-
-define <16 x i8> @shuf_16i8_17(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 9, i32 undef, i32 undef, i32 9, i32 undef, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_17:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $85
-}
-
-define <16 x i8> @shuf_16i8_18(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 10, i32 undef, i32 undef, i32 10, i32 undef, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_18:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $-86
-}
-
-define <16 x i8> @shuf_16i8_19(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 11, i32 undef, i32 undef, i32 11, i32 undef, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_19:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $-1
-}
-
-define <16 x i8> @shuf_16i8_20(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 12, i32 undef, i32 undef, i32 12, i32 undef, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_20:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $0
-}
-
-define <16 x i8> @shuf_16i8_21(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 13, i32 undef, i32 undef, i32 13, i32 undef, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_21:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $85
-}
-
-define <16 x i8> @shuf_16i8_22(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 14, i32 undef, i32 undef, i32 14, i32 undef, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_22:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $-86
-}
-
-define <16 x i8> @shuf_16i8_23(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 15, i32 undef, i32 undef, i32 15, i32 undef, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_23:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $-1
-}
diff --git a/test/CodeGen/X86/vec_splat.ll b/test/CodeGen/X86/vec_splat.ll
deleted file mode 100644
index 28f2a90..0000000
--- a/test/CodeGen/X86/vec_splat.ll
+++ /dev/null
@@ -1,68 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s -check-prefix=SSE2
-; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse3 | FileCheck %s -check-prefix=SSE3
-; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s -check-prefix=AVX
-
-define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind {
-	%tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0		; <<4 x float>> [#uses=1]
-	%tmp2 = insertelement <4 x float> %tmp, float %X, i32 1		; <<4 x float>> [#uses=1]
-	%tmp4 = insertelement <4 x float> %tmp2, float %X, i32 2		; <<4 x float>> [#uses=1]
-	%tmp6 = insertelement <4 x float> %tmp4, float %X, i32 3		; <<4 x float>> [#uses=1]
-	%tmp8 = load <4 x float>* %Q		; <<4 x float>> [#uses=1]
-	%tmp10 = fmul <4 x float> %tmp8, %tmp6		; <<4 x float>> [#uses=1]
-	store <4 x float> %tmp10, <4 x float>* %P
-	ret void
-
-; SSE2-LABEL: test_v4sf:
-; SSE2: pshufd $0
-
-; SSE3-LABEL: test_v4sf:
-; SSE3: pshufd $0
-}
-
-define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind {
-	%tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0		; <<2 x double>> [#uses=1]
-	%tmp2 = insertelement <2 x double> %tmp, double %X, i32 1		; <<2 x double>> [#uses=1]
-	%tmp4 = load <2 x double>* %Q		; <<2 x double>> [#uses=1]
-	%tmp6 = fmul <2 x double> %tmp4, %tmp2		; <<2 x double>> [#uses=1]
-	store <2 x double> %tmp6, <2 x double>* %P
-	ret void
-
-; SSE2-LABEL: test_v2sd:
-; SSE2: shufpd $0
-
-; SSE3-LABEL: test_v2sd:
-; SSE3: movddup
-}
-
-; Fold extract of a load into the load's address computation. This avoids spilling to the stack.
-define <4 x float> @load_extract_splat(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {
-  %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
-  %2 = load <4 x float>* %1, align 16
-  %3 = trunc i64 %j to i32
-  %4 = extractelement <4 x float> %2, i32 %3
-  %5 = insertelement <4 x float> undef, float %4, i32 0
-  %6 = insertelement <4 x float> %5, float %4, i32 1
-  %7 = insertelement <4 x float> %6, float %4, i32 2
-  %8 = insertelement <4 x float> %7, float %4, i32 3
-  ret <4 x float> %8
-  
-; AVX-LABEL: load_extract_splat
-; AVX-NOT: rsp
-; AVX: vbroadcastss
-}
-
-; Fold extract of a load into the load's address computation. This avoids spilling to the stack.
-define <4 x float> @load_extract_splat1(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {
-  %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
-  %2 = load <4 x float>* %1, align 16
-  %3 = extractelement <4 x float> %2, i64 %j
-  %4 = insertelement <4 x float> undef, float %3, i32 0
-  %5 = insertelement <4 x float> %4, float %3, i32 1
-  %6 = insertelement <4 x float> %5, float %3, i32 2
-  %7 = insertelement <4 x float> %6, float %3, i32 3
-  ret <4 x float> %7
-  
-; AVX-LABEL: load_extract_splat1
-; AVX-NOT: movs
-; AVX: vbroadcastss
-}
diff --git a/test/CodeGen/X86/vec_trunc_sext.ll b/test/CodeGen/X86/vec_trunc_sext.ll
new file mode 100644
index 0000000..3c446bb
--- /dev/null
+++ b/test/CodeGen/X86/vec_trunc_sext.ll
@@ -0,0 +1,30 @@
+; RUN: llc %s -mtriple=x86_64-unknown-unknown -mattr='-sse4.1' -o - | FileCheck %s -check-prefix=NO_SSE_41
+; RUN: llc %s -mtriple=x86_64-unknown-unknown -mattr='+sse4.1' -o - | FileCheck %s -check-prefix=SSE_41
+
+; PR20472 ( http://llvm.org/bugs/show_bug.cgi?id=20472 )
+; When sexting a trunc'd vector value, we can't eliminate the zext.
+; If we don't have SSE4.1, use punpck.
+; If we have SSE4.1, use pmovzx because it combines the load op.
+; There may be a better way to do this using pshufb + pmovsx,
+; but that is beyond our current codegen capabilities.
+
+define <4 x i32> @trunc_sext(<4 x i16>* %in) {
+  %load = load <4 x i16>* %in
+  %trunc = trunc <4 x i16> %load to <4 x i8>
+  %sext = sext <4 x i8> %trunc to <4 x i32>
+  ret <4 x i32> %sext
+
+; NO_SSE_41-LABEL: trunc_sext:
+; NO_SSE_41: movq (%rdi), %xmm0
+; NO_SSE_41-NEXT: punpcklwd %xmm0, %xmm0
+; NO_SSE_41-NEXT: pslld $24, %xmm0
+; NO_SSE_41-NEXT: psrad $24, %xmm0
+; NO_SSE_41-NEXT: retq
+
+; SSE_41-LABEL: trunc_sext:
+; SSE_41: pmovzxwd (%rdi), %xmm0
+; SSE_41-NEXT: pslld $24, %xmm0
+; SSE_41-NEXT: psrad $24, %xmm0
+; SSE_41-NEXT: retq
+}
+
diff --git a/test/CodeGen/X86/vec_uint_to_fp.ll b/test/CodeGen/X86/vec_uint_to_fp.ll
index ee20f1f..46cfcd9 100644
--- a/test/CodeGen/X86/vec_uint_to_fp.ll
+++ b/test/CodeGen/X86/vec_uint_to_fp.ll
@@ -1,11 +1,167 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck --check-prefix=CHECK --check-prefix=SSE --check-prefix=CST %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse4.1 | FileCheck --check-prefix=CHECK --check-prefix=SSE41 --check-prefix=CST  %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck --check-prefix=CHECK --check-prefix=AVX --check-prefix=CST %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx2 | FileCheck --check-prefix=CHECK --check-prefix=AVX2 %s
+
+; Check that the constant used in the vectors are the right ones.
+; SSE: [[MASKCSTADDR:LCPI0_[0-9]+]]:
+; SSE-NEXT: .long	65535                   ## 0xffff
+; SSE-NEXT: .long	65535                   ## 0xffff
+; SSE-NEXT: .long	65535                   ## 0xffff
+; SSE-NEXT: .long	65535                   ## 0xffff
+
+; CST: [[LOWCSTADDR:LCPI0_[0-9]+]]:
+; CST-NEXT: .long	1258291200              ## 0x4b000000
+; CST-NEXT: .long	1258291200              ## 0x4b000000
+; CST-NEXT: .long	1258291200              ## 0x4b000000
+; CST-NEXT: .long	1258291200              ## 0x4b000000
+
+; CST: [[HIGHCSTADDR:LCPI0_[0-9]+]]:
+; CST-NEXT: .long	1392508928              ## 0x53000000
+; CST-NEXT: .long	1392508928              ## 0x53000000
+; CST-NEXT: .long	1392508928              ## 0x53000000
+; CST-NEXT: .long	1392508928              ## 0x53000000
+
+; CST: [[MAGICCSTADDR:LCPI0_[0-9]+]]:
+; CST-NEXT: .long	3539992704              ## float -5.497642e+11
+; CST-NEXT: .long	3539992704              ## float -5.497642e+11
+; CST-NEXT: .long	3539992704              ## float -5.497642e+11
+; CST-NEXT: .long	3539992704              ## float -5.497642e+11
+
+; AVX2: [[LOWCSTADDR:LCPI0_[0-9]+]]:
+; AVX2-NEXT: .long	1258291200              ## 0x4b000000
+
+; AVX2: [[HIGHCSTADDR:LCPI0_[0-9]+]]:
+; AVX2-NEXT: .long	1392508928              ## 0x53000000
+
+; AVX2: [[MAGICCSTADDR:LCPI0_[0-9]+]]:
+; AVX2-NEXT: .long	3539992704              ## float -5.49764202E+11
 
-; Test that we are not lowering uinttofp to scalars
 define <4 x float> @test1(<4 x i32> %A) nounwind {
 ; CHECK-LABEL: test1:
-; CHECK-NOT: cvtsd2ss
-; CHECK: ret
+;
+; SSE: movdqa [[MASKCSTADDR]](%rip), [[MASK:%xmm[0-9]+]]
+; SSE-NEXT: pand %xmm0, [[MASK]]
+; After this instruction, MASK will have the value of the low parts
+; of the vector.
+; SSE-NEXT: por [[LOWCSTADDR]](%rip), [[MASK]]
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: por [[HIGHCSTADDR]](%rip), %xmm0
+; SSE-NEXT: addps [[MAGICCSTADDR]](%rip), %xmm0
+; SSE-NEXT: addps [[MASK]], %xmm0
+; SSE-NEXT: retq
+;
+; Currently we commute the arguments of the first blend, but this could be
+; improved to match the lowering of the second blend.
+; SSE41: movdqa [[LOWCSTADDR]](%rip), [[LOWVEC:%xmm[0-9]+]]
+; SSE41-NEXT: pblendw $85, %xmm0, [[LOWVEC]]
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pblendw $170, [[HIGHCSTADDR]](%rip), %xmm0
+; SSE41-NEXT: addps [[MAGICCSTADDR]](%rip), %xmm0
+; SSE41-NEXT: addps [[LOWVEC]], %xmm0
+; SSE41-NEXT: retq
+;
+; AVX: vpblendw $170, [[LOWCSTADDR]](%rip), %xmm0, [[LOWVEC:%xmm[0-9]+]]
+; AVX-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]]
+; AVX-NEXT: vpblendw $170, [[HIGHCSTADDR]](%rip), [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]]
+; AVX-NEXT: vaddps [[MAGICCSTADDR]](%rip), [[HIGHVEC]], [[TMP:%xmm[0-9]+]]
+; AVX-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0
+; AVX-NEXT: retq
+;
+; The lowering for AVX2 is a bit messy, because we select broadcast
+; instructions, instead of folding the constant loads.
+; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%xmm[0-9]+]]
+; AVX2-NEXT: vpblendw $170, [[LOWCST]], %xmm0, [[LOWVEC:%xmm[0-9]+]]
+; AVX2-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]]
+; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%xmm[0-9]+]]
+; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]]
+; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%xmm[0-9]+]]
+; AVX2-NEXT: vaddps [[MAGICCST]], [[HIGHVEC]], [[TMP:%xmm[0-9]+]]
+; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0
+; AVX2-NEXT: retq
   %C = uitofp <4 x i32> %A to <4 x float>
   ret <4 x float> %C
 }
 
+; Match the AVX2 constants used in the next function
+; AVX2: [[LOWCSTADDR:LCPI1_[0-9]+]]:
+; AVX2-NEXT: .long	1258291200              ## 0x4b000000
+
+; AVX2: [[HIGHCSTADDR:LCPI1_[0-9]+]]:
+; AVX2-NEXT: .long	1392508928              ## 0x53000000
+
+; AVX2: [[MAGICCSTADDR:LCPI1_[0-9]+]]:
+; AVX2-NEXT: .long	3539992704              ## float -5.49764202E+11
+
+define <8 x float> @test2(<8 x i32> %A) nounwind {
+; CHECK-LABEL: test2:
+; Legalization will break the thing is 2 x <4 x i32> on anthing prior AVX.
+; The constant used for in the vector instruction are shared between the
+; two sequences of instructions.
+;
+; SSE: movdqa {{.*#+}} [[MASK:xmm[0-9]+]] = [65535,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]]
+; SSE-NEXT: pand %[[MASK]], [[VECLOW]]
+; SSE-NEXT: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200]
+; SSE-NEXT: por %[[LOWCST]], [[VECLOW]]
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928]
+; SSE-NEXT: por %[[HIGHCST]], %xmm0
+; SSE-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
+; SSE-NEXT: addps %[[MAGICCST]], %xmm0
+; SSE-NEXT: addps [[VECLOW]], %xmm0
+; MASK is the low vector of the second part after this point.
+; SSE-NEXT: pand %xmm1, %[[MASK]]
+; SSE-NEXT: por %[[LOWCST]], %[[MASK]]
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: por %[[HIGHCST]], %xmm1
+; SSE-NEXT: addps %[[MAGICCST]], %xmm1
+; SSE-NEXT: addps %[[MASK]], %xmm1
+; SSE-NEXT: retq
+;
+; SSE41: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200]
+; SSE41-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]]
+; SSE41-NEXT: pblendw $170, %[[LOWCST]], [[VECLOW]]
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928]
+; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm0
+; SSE41-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
+; SSE41-NEXT: addps %[[MAGICCST]], %xmm0
+; SSE41-NEXT: addps [[VECLOW]], %xmm0
+; LOWCST is the low vector of the second part after this point.
+; The operands of the blend are inverted because we reuse xmm1
+; in the next shift.
+; SSE41-NEXT: pblendw $85, %xmm1, %[[LOWCST]]
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm1
+; SSE41-NEXT: addps %[[MAGICCST]], %xmm1
+; SSE41-NEXT: addps %[[LOWCST]], %xmm1
+; SSE41-NEXT: retq
+;
+; Test that we are not lowering uinttofp to scalars
+; AVX-NOT: cvtsd2ss
+; AVX: retq
+;
+; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%ymm[0-9]+]]
+; AVX2-NEXT: vpblendw $170, [[LOWCST]], %ymm0, [[LOWVEC:%ymm[0-9]+]]
+; AVX2-NEXT: vpsrld $16, %ymm0, [[SHIFTVEC:%ymm[0-9]+]]
+; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%ymm[0-9]+]]
+; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%ymm[0-9]+]]
+; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%ymm[0-9]+]]
+; AVX2-NEXT: vaddps [[MAGICCST]], [[HIGHVEC]], [[TMP:%ymm[0-9]+]]
+; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %ymm0
+; AVX2-NEXT: retq
+  %C = uitofp <8 x i32> %A to <8 x float>
+  ret <8 x float> %C
+}
+
+define <4 x double> @test3(<4 x i32> %arg) {
+; CHECK-LABEL: test3:
+; This test used to crash because we were custom lowering it as if it was
+; a conversion between <4 x i32> and <4 x float>.
+; AVX: vcvtdq2pd
+; AVX2: vcvtdq2pd
+; CHECK: retq
+  %tmp = uitofp <4 x i32> %arg to <4 x double>
+  ret <4 x double> %tmp
+}
diff --git a/test/CodeGen/X86/vec_unsafe-fp-math.ll b/test/CodeGen/X86/vec_unsafe-fp-math.ll
new file mode 100644
index 0000000..827d418
--- /dev/null
+++ b/test/CodeGen/X86/vec_unsafe-fp-math.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -enable-unsafe-fp-math -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s
+
+; Make sure that vectors get the same benefits as scalars when using unsafe-fp-math.
+
+; Subtracting zero is free.
+define <4 x float> @vec_fsub_zero(<4 x float> %x) {
+; CHECK-LABEL: vec_fsub_zero:
+; CHECK-NOT: subps
+; CHECK-NOT: xorps
+; CHECK: retq
+  %sub = fsub <4 x float> %x, zeroinitializer
+  ret <4 x float> %sub
+}
+
+; Negating doesn't require subtraction.
+define <4 x float> @vec_fneg(<4 x float> %x) {
+; CHECK-LABEL: vec_fneg:
+; CHECK: xorps  {{.*}}LCP{{.*}}, %xmm0
+; CHECK-NOT: subps
+; CHECK-NEXT: retq
+  %sub = fsub <4 x float> zeroinitializer, %x
+  ret <4 x float> %sub
+}
diff --git a/test/CodeGen/X86/vec_zext.ll b/test/CodeGen/X86/vec_zext.ll
deleted file mode 100644
index 615a50b..0000000
--- a/test/CodeGen/X86/vec_zext.ll
+++ /dev/null
@@ -1,69 +0,0 @@
-; RUN: llc < %s -march=x86-64
-; PR 9267
-
-define<4 x i32> @func_16_32() {
-  %F = load <4 x i16>* undef
-  %G = zext <4 x i16> %F to <4 x i32>
-  %H = load <4 x i16>* undef
-  %Y = zext <4 x i16> %H to <4 x i32>
-  %T = add <4 x i32> %Y, %G
-  store <4 x i32>%T , <4 x i32>* undef
-  ret <4 x i32> %T
-}
-
-define<4 x i64> @func_16_64() {
-  %F = load <4 x i16>* undef
-  %G = zext <4 x i16> %F to <4 x i64>
-  %H = load <4 x i16>* undef
-  %Y = zext <4 x i16> %H to <4 x i64>
-  %T = xor <4 x i64> %Y, %G
-  store <4 x i64>%T , <4 x i64>* undef
-  ret <4 x i64> %T
-}
-
-define<4 x i64> @func_32_64() {
-  %F = load <4 x i32>* undef
-  %G = zext <4 x i32> %F to <4 x i64>
-  %H = load <4 x i32>* undef
-  %Y = zext <4 x i32> %H to <4 x i64>
-  %T = or <4 x i64> %Y, %G
-  ret <4 x i64> %T
-}
-
-define<4 x i16> @func_8_16() {
-  %F = load <4 x i8>* undef
-  %G = zext <4 x i8> %F to <4 x i16>
-  %H = load <4 x i8>* undef
-  %Y = zext <4 x i8> %H to <4 x i16>
-  %T = add <4 x i16> %Y, %G
-  ret <4 x i16> %T
-}
-
-define<4 x i32> @func_8_32() {
-  %F = load <4 x i8>* undef
-  %G = zext <4 x i8> %F to <4 x i32>
-  %H = load <4 x i8>* undef
-  %Y = zext <4 x i8> %H to <4 x i32>
-  %T = sub <4 x i32> %Y, %G
-  ret <4 x i32> %T
-}
-
-define<4 x i64> @func_8_64() {
-  %F = load <4 x i8>* undef
-  %G = zext <4 x i8> %F to <4 x i64>
-  %H = load <4 x i8>* undef
-  %Y = zext <4 x i8> %H to <4 x i64>
-  %T = add <4 x i64> %Y, %G
-  ret <4 x i64> %T
-}
-
-define<4 x i32> @const_16_32() {
-  %G = zext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i32>
-  ret <4 x i32> %G
-}
-
-define<4 x i64> @const_16_64() {
-  %G = zext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i64>
-  ret <4 x i64> %G
-}
-
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
new file mode 100644
index 0000000..0a3ed7e
--- /dev/null
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -0,0 +1,708 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+; AVX128 tests:
+
+define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
+; SSE2-LABEL: vsel_float:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_float:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_float:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: vsel_float:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
+  ret <4 x float> %vsel
+}
+
+define <4 x float> @vsel_float2(<4 x float> %v1, <4 x float> %v2) {
+; SSE-LABEL: vsel_float2:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: vsel_float2:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
+  ret <4 x float> %vsel
+}
+
+define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
+; SSE2-LABEL: vsel_4xi8:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_4xi8:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_4xi8:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: vsel_4xi8:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vsel_4xi8:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2-NEXT:    retq
+entry:
+  %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
+  ret <4 x i8> %vsel
+}
+
+define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
+; SSE2-LABEL: vsel_4xi16:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_4xi16:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_4xi16:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: vsel_4xi16:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vsel_4xi16:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT:    retq
+entry:
+  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
+  ret <4 x i16> %vsel
+}
+
+define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
+; SSE2-LABEL: vsel_i32:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_i32:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_i32:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: vsel_i32:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vsel_i32:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    retq
+entry:
+  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
+  ret <4 x i32> %vsel
+}
+
+define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
+; SSE-LABEL: vsel_double:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: vsel_double:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2
+  ret <2 x double> %vsel
+}
+
+define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) {
+; SSE-LABEL: vsel_i64:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: vsel_i64:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2
+  ret <2 x i64> %vsel
+}
+
+define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
+; SSE2-LABEL: vsel_8xi16:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_8xi16:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_8xi16:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: vsel_8xi16:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
+  ret <8 x i16> %vsel
+}
+
+define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
+; SSE2-LABEL: vsel_i8:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_i8:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_i8:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: vsel_i8:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2
+  ret <16 x i8> %vsel
+}
+
+
+; AVX256 tests:
+
+define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
+; SSE-LABEL: vsel_float8:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movss %xmm0, %xmm2
+; SSE-NEXT:    movss %xmm1, %xmm3
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: vsel_float8:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2
+  ret <8 x float> %vsel
+}
+
+define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
+; SSE-LABEL: vsel_i328:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movss %xmm0, %xmm2
+; SSE-NEXT:    movss %xmm1, %xmm3
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: vsel_i328:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vsel_i328:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT:    retq
+entry:
+  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
+  ret <8 x i32> %vsel
+}
+
+define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
+; SSE2-LABEL: vsel_double8:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd %xmm0, %xmm4
+; SSE2-NEXT:    movsd %xmm2, %xmm6
+; SSE2-NEXT:    movaps %xmm4, %xmm0
+; SSE2-NEXT:    movaps %xmm5, %xmm1
+; SSE2-NEXT:    movaps %xmm6, %xmm2
+; SSE2-NEXT:    movaps %xmm7, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_double8:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd %xmm0, %xmm4
+; SSSE3-NEXT:    movsd %xmm2, %xmm6
+; SSSE3-NEXT:    movaps %xmm4, %xmm0
+; SSSE3-NEXT:    movaps %xmm5, %xmm1
+; SSSE3-NEXT:    movaps %xmm6, %xmm2
+; SSSE3-NEXT:    movaps %xmm7, %xmm3
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_double8:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1]
+; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1]
+; SSE41-NEXT:    movaps %xmm5, %xmm1
+; SSE41-NEXT:    movaps %xmm7, %xmm3
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: vsel_double8:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3]
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2
+  ret <8 x double> %vsel
+}
+
+define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
+; SSE2-LABEL: vsel_i648:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd %xmm0, %xmm4
+; SSE2-NEXT:    movsd %xmm2, %xmm6
+; SSE2-NEXT:    movaps %xmm4, %xmm0
+; SSE2-NEXT:    movaps %xmm5, %xmm1
+; SSE2-NEXT:    movaps %xmm6, %xmm2
+; SSE2-NEXT:    movaps %xmm7, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_i648:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd %xmm0, %xmm4
+; SSSE3-NEXT:    movsd %xmm2, %xmm6
+; SSSE3-NEXT:    movaps %xmm4, %xmm0
+; SSSE3-NEXT:    movaps %xmm5, %xmm1
+; SSSE3-NEXT:    movaps %xmm6, %xmm2
+; SSSE3-NEXT:    movaps %xmm7, %xmm3
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_i648:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT:    movaps %xmm5, %xmm1
+; SSE41-NEXT:    movaps %xmm7, %xmm3
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: vsel_i648:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vsel_i648:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-NEXT:    retq
+entry:
+  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2
+  ret <8 x i64> %vsel
+}
+
+define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
+; SSE-LABEL: vsel_double4:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movsd %xmm0, %xmm2
+; SSE-NEXT:    movsd %xmm1, %xmm3
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: vsel_double4:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
+  ret <4 x double> %vsel
+}
+
+define <2 x double> @testa(<2 x double> %x, <2 x double> %y) {
+; SSE2-LABEL: testa:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    cmplepd %xmm0, %xmm2
+; SSE2-NEXT:    andpd %xmm2, %xmm0
+; SSE2-NEXT:    andnpd %xmm1, %xmm2
+; SSE2-NEXT:    orpd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: testa:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movapd %xmm1, %xmm2
+; SSSE3-NEXT:    cmplepd %xmm0, %xmm2
+; SSSE3-NEXT:    andpd %xmm2, %xmm0
+; SSSE3-NEXT:    andnpd %xmm1, %xmm2
+; SSSE3-NEXT:    orpd %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: testa:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movapd %xmm0, %xmm2
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    cmplepd %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm2, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: testa:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vcmplepd %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %max_is_x = fcmp oge <2 x double> %x, %y
+  %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y
+  ret <2 x double> %max
+}
+
+define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {
+; SSE2-LABEL: testb:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    cmpnlepd %xmm0, %xmm2
+; SSE2-NEXT:    andpd %xmm2, %xmm0
+; SSE2-NEXT:    andnpd %xmm1, %xmm2
+; SSE2-NEXT:    orpd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: testb:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movapd %xmm1, %xmm2
+; SSSE3-NEXT:    cmpnlepd %xmm0, %xmm2
+; SSSE3-NEXT:    andpd %xmm2, %xmm0
+; SSSE3-NEXT:    andnpd %xmm1, %xmm2
+; SSSE3-NEXT:    orpd %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: testb:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movapd %xmm0, %xmm2
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    cmpnlepd %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm2, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: testb:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vcmpnlepd %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %min_is_x = fcmp ult <2 x double> %x, %y
+  %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
+  ret <2 x double> %min
+}
+
+; If we can figure out a blend has a constant mask, we should emit the
+; blend instruction with an immediate mask
+define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
+; SSE-LABEL: constant_blendvpd_avx:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movsd %xmm1, %xmm3
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: constant_blendvpd_avx:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
+; AVX-NEXT:    retq
+entry:
+  %select = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab
+  ret <4 x double> %select
+}
+
+define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
+; SSE2-LABEL: constant_blendvps_avx:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movaps {{.*#+}} xmm4 = [4294967295,4294967295,4294967295,0]
+; SSE2-NEXT:    andps %xmm4, %xmm2
+; SSE2-NEXT:    movaps {{.*#+}} xmm5 = [0,0,0,4294967295]
+; SSE2-NEXT:    andps %xmm5, %xmm0
+; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm4, %xmm3
+; SSE2-NEXT:    andps %xmm5, %xmm1
+; SSE2-NEXT:    orps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: constant_blendvps_avx:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movaps {{.*#+}} xmm4 = [4294967295,4294967295,4294967295,0]
+; SSSE3-NEXT:    andps %xmm4, %xmm2
+; SSSE3-NEXT:    movaps {{.*#+}} xmm5 = [0,0,0,4294967295]
+; SSSE3-NEXT:    andps %xmm5, %xmm0
+; SSSE3-NEXT:    orps %xmm2, %xmm0
+; SSSE3-NEXT:    andps %xmm4, %xmm3
+; SSSE3-NEXT:    andps %xmm5, %xmm1
+; SSSE3-NEXT:    orps %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: constant_blendvps_avx:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: constant_blendvps_avx:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
+; AVX-NEXT:    retq
+entry:
+  %select = select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %xyzw, <8 x float> %abcd
+  ret <8 x float> %select
+}
+
+define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
+; SSE2-LABEL: constant_pblendvb_avx2:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; SSE2-NEXT:    andps %xmm4, %xmm2
+; SSE2-NEXT:    movaps {{.*#+}} xmm5 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; SSE2-NEXT:    andps %xmm5, %xmm0
+; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm4, %xmm3
+; SSE2-NEXT:    andps %xmm5, %xmm1
+; SSE2-NEXT:    orps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: constant_pblendvb_avx2:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; SSSE3-NEXT:    andps %xmm4, %xmm2
+; SSSE3-NEXT:    movaps {{.*#+}} xmm5 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; SSSE3-NEXT:    andps %xmm5, %xmm0
+; SSSE3-NEXT:    orps %xmm2, %xmm0
+; SSSE3-NEXT:    andps %xmm4, %xmm3
+; SSSE3-NEXT:    andps %xmm5, %xmm1
+; SSSE3-NEXT:    orps %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: constant_pblendvb_avx2:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; SSE41-NEXT:    pblendvb %xmm4, %xmm2
+; SSE41-NEXT:    pblendvb %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm3, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_pblendvb_avx2:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_pblendvb_avx2:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+entry:
+  %select = select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %xyzw, <32 x i8> %abcd
+  ret <32 x i8> %select
+}
+
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+;; 4 tests for shufflevectors that optimize to blend + immediate
+define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: blend_shufflevector_4xfloat:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: blend_shufflevector_4xfloat:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: blend_shufflevector_4xfloat:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: blend_shufflevector_4xfloat:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX-NEXT:    retq
+entry:
+  %select = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x float> %select
+}
+
+define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b) {
+; SSE2-LABEL: blend_shufflevector_8xfloat:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movss %xmm0, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: blend_shufflevector_8xfloat:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movss %xmm0, %xmm2
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2]
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: blend_shufflevector_8xfloat:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: blend_shufflevector_8xfloat:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7]
+; AVX-NEXT:    retq
+entry:
+  %select = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 15>
+  ret <8 x float> %select
+}
+
+define <4 x double> @blend_shufflevector_4xdouble(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: blend_shufflevector_4xdouble:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: blend_shufflevector_4xdouble:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd %xmm0, %xmm2
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: blend_shufflevector_4xdouble:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: blend_shufflevector_4xdouble:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX-NEXT:    retq
+entry:
+  %select = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  ret <4 x double> %select
+}
+
+define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) {
+; SSE2-LABEL: blend_shufflevector_4xi64:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: blend_shufflevector_4xi64:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd %xmm2, %xmm0
+; SSSE3-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: blend_shufflevector_4xi64:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    movaps %xmm3, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: blend_shufflevector_4xi64:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: blend_shufflevector_4xi64:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    retq
+entry:
+  %select = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+  ret <4 x i64> %select
+}
diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll
index b6d43e9..4b269dc 100644
--- a/test/CodeGen/X86/vector-idiv.ll
+++ b/test/CodeGen/X86/vector-idiv.ll
@@ -1,221 +1,1255 @@
-; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=SSE41
-; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE
-; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX
+; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s --check-prefix=SSE41
+; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX
 
-define <4 x i32> @test1(<4 x i32> %a) {
-  %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
-  ret <4 x i32> %div
+target triple = "x86_64-unknown-unknown"
 
+define <4 x i32> @test1(<4 x i32> %a) {
 ; SSE41-LABEL: test1:
-; SSE41: pmuludq
-; SSE41: pshufd	$49
-; SSE41: pmuludq
-; SSE41: shufps	$-35
-; SSE41: psubd
-; SSE41: psrld $1
-; SSE41: padd
-; SSE41: psrld $2
-
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pmuludq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm1, %xmm3
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT:    psubd %xmm2, %xmm0
+; SSE41-NEXT:    psrld $1, %xmm0
+; SSE41-NEXT:    paddd %xmm2, %xmm0
+; SSE41-NEXT:    psrld $2, %xmm0
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test1:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm1, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT:    psubd %xmm2, %xmm0
+; SSE-NEXT:    psrld $1, %xmm0
+; SSE-NEXT:    paddd %xmm2, %xmm0
+; SSE-NEXT:    psrld $2, %xmm0
+; SSE-NEXT:    retq
+;
 ; AVX-LABEL: test1:
-; AVX: vpmuludq
-; AVX: vpshufd	$49
-; AVX: vpmuludq
-; AVX: vshufps	$-35
-; AVX: vpsubd
-; AVX: vpsrld $1
-; AVX: vpadd
-; AVX: vpsrld $2
+; AVX:       # BB#0:
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+  ret <4 x i32> %div
 }
 
 define <8 x i32> @test2(<8 x i32> %a) {
+; SSE41-LABEL: test2:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm4, %xmm5
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE41-NEXT:    psubd %xmm3, %xmm0
+; SSE41-NEXT:    psrld $1, %xmm0
+; SSE41-NEXT:    paddd %xmm3, %xmm0
+; SSE41-NEXT:    psrld $2, %xmm0
+; SSE41-NEXT:    pmuludq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm4, %xmm3
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT:    psubd %xmm2, %xmm1
+; SSE41-NEXT:    psrld $1, %xmm1
+; SSE41-NEXT:    paddd %xmm2, %xmm1
+; SSE41-NEXT:    psrld $2, %xmm1
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test2:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    pmuludq %xmm2, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm5
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE-NEXT:    psubd %xmm3, %xmm0
+; SSE-NEXT:    psrld $1, %xmm0
+; SSE-NEXT:    paddd %xmm3, %xmm0
+; SSE-NEXT:    psrld $2, %xmm0
+; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT:    psubd %xmm2, %xmm1
+; SSE-NEXT:    psrld $1, %xmm1
+; SSE-NEXT:    paddd %xmm2, %xmm1
+; SSE-NEXT:    psrld $2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
+; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
+; AVX-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vpsrld $2, %ymm0, %ymm0
+; AVX-NEXT:    retq
   %div = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
   ret <8 x i32> %div
-
-; AVX-LABEL: test2:
-; AVX: vpbroadcastd
-; AVX: vpalignr $4
-; AVX: vpmuludq
-; AVX: vpmuludq
-; AVX: vpblendd $170
-; AVX: vpsubd
-; AVX: vpsrld $1
-; AVX: vpadd
-; AVX: vpsrld $2
 }
 
 define <8 x i16> @test3(<8 x i16> %a) {
-  %div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
-  ret <8 x i16> %div
-
 ; SSE41-LABEL: test3:
-; SSE41: pmulhuw
-; SSE41: psubw
-; SSE41: psrlw $1
-; SSE41: paddw
-; SSE41: psrlw $2
-
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
+; SSE41-NEXT:    psubw %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $1, %xmm0
+; SSE41-NEXT:    paddw %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $2, %xmm0
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test3:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; SSE-NEXT:    pmulhuw %xmm0, %xmm1
+; SSE-NEXT:    psubw %xmm1, %xmm0
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    psrlw $2, %xmm0
+; SSE-NEXT:    retq
+;
 ; AVX-LABEL: test3:
-; AVX: vpmulhuw
-; AVX: vpsubw
-; AVX: vpsrlw $1
-; AVX: vpaddw
-; AVX: vpsrlw $2
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <8 x i16> %div
 }
 
 define <16 x i16> @test4(<16 x i16> %a) {
+; SSE41-LABEL: test4:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pmulhuw %xmm2, %xmm3
+; SSE41-NEXT:    psubw %xmm3, %xmm0
+; SSE41-NEXT:    psrlw $1, %xmm0
+; SSE41-NEXT:    paddw %xmm3, %xmm0
+; SSE41-NEXT:    psrlw $2, %xmm0
+; SSE41-NEXT:    pmulhuw %xmm1, %xmm2
+; SSE41-NEXT:    psubw %xmm2, %xmm1
+; SSE41-NEXT:    psrlw $1, %xmm1
+; SSE41-NEXT:    paddw %xmm2, %xmm1
+; SSE41-NEXT:    psrlw $2, %xmm1
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test4:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    pmulhuw %xmm2, %xmm3
+; SSE-NEXT:    psubw %xmm3, %xmm0
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    paddw %xmm3, %xmm0
+; SSE-NEXT:    psrlw $2, %xmm0
+; SSE-NEXT:    pmulhuw %xmm1, %xmm2
+; SSE-NEXT:    psubw %xmm2, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    paddw %xmm2, %xmm1
+; SSE-NEXT:    psrlw $2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
+; AVX-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vpsrlw $2, %ymm0, %ymm0
+; AVX-NEXT:    retq
   %div = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
   ret <16 x i16> %div
-
-; AVX-LABEL: test4:
-; AVX: vpmulhuw
-; AVX: vpsubw
-; AVX: vpsrlw $1
-; AVX: vpaddw
-; AVX: vpsrlw $2
-; AVX-NOT: vpmulhuw
 }
 
 define <8 x i16> @test5(<8 x i16> %a) {
-  %div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
-  ret <8 x i16> %div
-
 ; SSE41-LABEL: test5:
-; SSE41: pmulhw
-; SSE41: psrlw $15
-; SSE41: psraw $1
-; SSE41: paddw
-
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmulhw {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $15, %xmm1
+; SSE41-NEXT:    psraw $1, %xmm0
+; SSE41-NEXT:    paddw %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test5:
+; SSE:       # BB#0:
+; SSE-NEXT:    pmulhw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrlw $15, %xmm1
+; SSE-NEXT:    psraw $1, %xmm0
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
 ; AVX-LABEL: test5:
-; AVX: vpmulhw
-; AVX: vpsrlw $15
-; AVX: vpsraw $1
-; AVX: vpaddw
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm1
+; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+  ret <8 x i16> %div
 }
 
 define <16 x i16> @test6(<16 x i16> %a) {
+; SSE41-LABEL: test6:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
+; SSE41-NEXT:    pmulhw %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrlw $15, %xmm3
+; SSE41-NEXT:    psraw $1, %xmm0
+; SSE41-NEXT:    paddw %xmm3, %xmm0
+; SSE41-NEXT:    pmulhw %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $15, %xmm2
+; SSE41-NEXT:    psraw $1, %xmm1
+; SSE41-NEXT:    paddw %xmm2, %xmm1
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test6:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
+; SSE-NEXT:    pmulhw %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psrlw $15, %xmm3
+; SSE-NEXT:    psraw $1, %xmm0
+; SSE-NEXT:    paddw %xmm3, %xmm0
+; SSE-NEXT:    pmulhw %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    psrlw $15, %xmm2
+; SSE-NEXT:    psraw $1, %xmm1
+; SSE-NEXT:    paddw %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test6:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmulhw {{.*}}(%rip), %ymm0, %ymm0
+; AVX-NEXT:    vpsrlw $15, %ymm0, %ymm1
+; AVX-NEXT:    vpsraw $1, %ymm0, %ymm0
+; AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
   %div = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
   ret <16 x i16> %div
-
-; AVX-LABEL: test6:
-; AVX: vpmulhw
-; AVX: vpsrlw $15
-; AVX: vpsraw $1
-; AVX: vpaddw
-; AVX-NOT: vpmulhw
 }
 
 define <16 x i8> @test7(<16 x i8> %a) {
-  %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
-  ret <16 x i8> %div
-
-; FIXME: scalarized
 ; SSE41-LABEL: test7:
-; SSE41: pext
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pextrb $1, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pextrb $0, %xmm0, %ecx
+; SSE41-NEXT:    movsbl %cl, %ecx
+; SSE41-NEXT:    imull $-109, %ecx, %edx
+; SSE41-NEXT:    shrl $8, %edx
+; SSE41-NEXT:    addb %dl, %cl
+; SSE41-NEXT:    movb %cl, %dl
+; SSE41-NEXT:    shrb $7, %dl
+; SSE41-NEXT:    sarb $2, %cl
+; SSE41-NEXT:    addb %dl, %cl
+; SSE41-NEXT:    movzbl %cl, %ecx
+; SSE41-NEXT:    movd %ecx, %xmm1
+; SSE41-NEXT:    pinsrb $1, %eax, %xmm1
+; SSE41-NEXT:    pextrb $2, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $2, %eax, %xmm1
+; SSE41-NEXT:    pextrb $3, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $3, %eax, %xmm1
+; SSE41-NEXT:    pextrb $4, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $4, %eax, %xmm1
+; SSE41-NEXT:    pextrb $5, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $5, %eax, %xmm1
+; SSE41-NEXT:    pextrb $6, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $6, %eax, %xmm1
+; SSE41-NEXT:    pextrb $7, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $7, %eax, %xmm1
+; SSE41-NEXT:    pextrb $8, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $8, %eax, %xmm1
+; SSE41-NEXT:    pextrb $9, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $9, %eax, %xmm1
+; SSE41-NEXT:    pextrb $10, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $10, %eax, %xmm1
+; SSE41-NEXT:    pextrb $11, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $11, %eax, %xmm1
+; SSE41-NEXT:    pextrb $12, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $12, %eax, %xmm1
+; SSE41-NEXT:    pextrb $13, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $13, %eax, %xmm1
+; SSE41-NEXT:    pextrb $14, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $14, %eax, %xmm1
+; SSE41-NEXT:    pextrb $15, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $15, %eax, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test7:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm3
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm3
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm3
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm4
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT:    retq
+;
 ; AVX-LABEL: test7:
-; AVX: pext
+; AVX:       # BB#0:
+; AVX-NEXT:    vpextrb $1, %xmm0, %eax
+; AVX-NEXT:    movsbl %al, %eax
+; AVX-NEXT:    imull $-109, %eax, %ecx
+; AVX-NEXT:    shrl $8, %ecx
+; AVX-NEXT:    addb %cl, %al
+; AVX-NEXT:    movb %al, %cl
+; AVX-NEXT:    shrb $7, %cl
+; AVX-NEXT:    sarb $2, %al
+; AVX-NEXT:    addb %cl, %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    vpextrb $0, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %dl
+; AVX-NEXT:    shrb $7, %dl
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movzbl %cl, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vpextrb $2, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $3, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $4, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $5, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $6, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $7, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $8, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $9, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $10, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $11, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $12, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $13, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $14, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $15, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm0
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+  ret <16 x i8> %div
 }
 
 define <4 x i32> @test8(<4 x i32> %a) {
-  %div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
-  ret <4 x i32> %div
-
 ; SSE41-LABEL: test8:
-; SSE41: pmuldq
-; SSE41: pshufd	$49
-; SSE41-NOT: pshufd	$49
-; SSE41: pmuldq
-; SSE41: shufps	$-35
-; SSE41: pshufd	$-40
-; SSE41: padd
-; SSE41: psrld $31
-; SSE41: psrad $2
-; SSE41: padd
-
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pmuldq %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm2, %xmm3
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $31, %xmm0
+; SSE41-NEXT:    psrad $2, %xmm1
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
 ; SSE-LABEL: test8:
-; SSE: psrad $31
-; SSE: pand
-; SSE: paddd
-; SSE: pmuludq
-; SSE: pshufd	$49
-; SSE-NOT: pshufd	$49
-; SSE: pmuludq
-; SSE: shufps	$-35
-; SSE: pshufd	$-40
-; SSE: psubd
-; SSE: padd
-; SSE: psrld $31
-; SSE: psrad $2
-; SSE: padd
-
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psrad $31, %xmm3
+; SSE-NEXT:    pand %xmm2, %xmm3
+; SSE-NEXT:    paddd %xmm1, %xmm3
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    pmuludq %xmm2, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm4
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE-NEXT:    psubd %xmm3, %xmm1
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $31, %xmm0
+; SSE-NEXT:    psrad $2, %xmm1
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
 ; AVX-LABEL: test8:
-; AVX: vpmuldq
-; AVX: vpshufd	$49
-; AVX-NOT: vpshufd	$49
-; AVX: vpmuldq
-; AVX: vshufps	$-35
-; AVX: vpshufd	$-40
-; AVX: vpadd
-; AVX: vpsrld $31
-; AVX: vpsrad $2
-; AVX: vpadd
+; AVX:       # BB#0:
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-NEXT:    vpmuldq %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrld $31, %xmm0, %xmm1
+; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+  ret <4 x i32> %div
 }
 
 define <8 x i32> @test9(<8 x i32> %a) {
+; SSE41-LABEL: test9:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT:   # kill: XMM0<def> XMM3<kill>
+; SSE41-NEXT:    pmuldq %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm4, %xmm5
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm5[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE41-NEXT:    paddd %xmm3, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrld $31, %xmm3
+; SSE41-NEXT:    psrad $2, %xmm0
+; SSE41-NEXT:    paddd %xmm3, %xmm0
+; SSE41-NEXT:    pmuldq %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm4, %xmm3
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE41-NEXT:    paddd %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrld $31, %xmm2
+; SSE41-NEXT:    psrad $2, %xmm1
+; SSE41-NEXT:    paddd %xmm2, %xmm1
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test9:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    psrad $31, %xmm4
+; SSE-NEXT:    movdqa %xmm4, %xmm0
+; SSE-NEXT:    pand %xmm3, %xmm0
+; SSE-NEXT:    movdqa %xmm3, %xmm5
+; SSE-NEXT:    psrad $31, %xmm5
+; SSE-NEXT:    pand %xmm1, %xmm5
+; SSE-NEXT:    paddd %xmm0, %xmm5
+; SSE-NEXT:    movdqa %xmm3, %xmm0
+; SSE-NEXT:    pmuludq %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm6, %xmm7
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm7[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    psubd %xmm5, %xmm0
+; SSE-NEXT:    paddd %xmm3, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psrld $31, %xmm3
+; SSE-NEXT:    psrad $2, %xmm0
+; SSE-NEXT:    paddd %xmm3, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm4
+; SSE-NEXT:    movdqa %xmm2, %xmm3
+; SSE-NEXT:    psrad $31, %xmm3
+; SSE-NEXT:    pand %xmm1, %xmm3
+; SSE-NEXT:    paddd %xmm4, %xmm3
+; SSE-NEXT:    pmuludq %xmm2, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm6, %xmm4
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE-NEXT:    psubd %xmm3, %xmm1
+; SSE-NEXT:    paddd %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    psrld $31, %xmm2
+; SSE-NEXT:    psrad $2, %xmm1
+; SSE-NEXT:    paddd %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test9:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpmuldq %ymm2, %ymm3, %ymm2
+; AVX-NEXT:    vpmuldq %ymm1, %ymm0, %ymm1
+; AVX-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vpsrld $31, %ymm0, %ymm1
+; AVX-NEXT:    vpsrad $2, %ymm0, %ymm0
+; AVX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
   %div = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
   ret <8 x i32> %div
-
-; AVX-LABEL: test9:
-; AVX: vpalignr $4
-; AVX: vpbroadcastd
-; AVX: vpmuldq
-; AVX: vpmuldq
-; AVX: vpblendd $170
-; AVX: vpadd
-; AVX: vpsrld $31
-; AVX: vpsrad $2
-; AVX: vpadd
 }
 
 define <8 x i32> @test10(<8 x i32> %a) {
+; SSE41-LABEL: test10:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm4, %xmm5
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psubd %xmm3, %xmm5
+; SSE41-NEXT:    psrld $1, %xmm5
+; SSE41-NEXT:    paddd %xmm3, %xmm5
+; SSE41-NEXT:    psrld $2, %xmm5
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [7,7,7,7]
+; SSE41-NEXT:    pmulld %xmm3, %xmm5
+; SSE41-NEXT:    psubd %xmm5, %xmm0
+; SSE41-NEXT:    pmuludq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm4, %xmm5
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psubd %xmm2, %xmm4
+; SSE41-NEXT:    psrld $1, %xmm4
+; SSE41-NEXT:    paddd %xmm2, %xmm4
+; SSE41-NEXT:    psrld $2, %xmm4
+; SSE41-NEXT:    pmulld %xmm3, %xmm4
+; SSE41-NEXT:    psubd %xmm4, %xmm1
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test10:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    pmuludq %xmm2, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm5
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    psubd %xmm3, %xmm5
+; SSE-NEXT:    psrld $1, %xmm5
+; SSE-NEXT:    paddd %xmm3, %xmm5
+; SSE-NEXT:    psrld $2, %xmm5
+; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [7,7,7,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm3, %xmm5
+; SSE-NEXT:    pmuludq %xmm3, %xmm6
+; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2,1,3]
+; SSE-NEXT:    psubd %xmm5, %xmm0
+; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm5
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    psubd %xmm2, %xmm4
+; SSE-NEXT:    psrld $1, %xmm4
+; SSE-NEXT:    paddd %xmm2, %xmm4
+; SSE-NEXT:    psrld $2, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm3, %xmm4
+; SSE-NEXT:    pmuludq %xmm3, %xmm2
+; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
+; SSE-NEXT:    psubd %xmm4, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test10:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
+; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
+; AVX-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX-NEXT:    vpsubd %ymm1, %ymm0, %ymm2
+; AVX-NEXT:    vpsrld $1, %ymm2, %ymm2
+; AVX-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
+; AVX-NEXT:    vpsrld $2, %ymm1, %ymm1
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
+; AVX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
   %rem = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
   ret <8 x i32> %rem
-
-; AVX-LABEL: test10:
-; AVX: vpbroadcastd
-; AVX: vpalignr $4
-; AVX: vpmuludq
-; AVX: vpmuludq
-; AVX: vpblendd $170
-; AVX: vpsubd
-; AVX: vpsrld $1
-; AVX: vpadd
-; AVX: vpsrld $2
-; AVX: vpmulld
 }
 
 define <8 x i32> @test11(<8 x i32> %a) {
+; SSE41-LABEL: test11:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pmuldq %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm4, %xmm5
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE41-NEXT:    paddd %xmm0, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm5
+; SSE41-NEXT:    psrld $31, %xmm5
+; SSE41-NEXT:    psrad $2, %xmm3
+; SSE41-NEXT:    paddd %xmm5, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [7,7,7,7]
+; SSE41-NEXT:    pmulld %xmm5, %xmm3
+; SSE41-NEXT:    psubd %xmm3, %xmm0
+; SSE41-NEXT:    pmuldq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm4, %xmm3
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT:    paddd %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrld $31, %xmm3
+; SSE41-NEXT:    psrad $2, %xmm2
+; SSE41-NEXT:    paddd %xmm3, %xmm2
+; SSE41-NEXT:    pmulld %xmm5, %xmm2
+; SSE41-NEXT:    psubd %xmm2, %xmm1
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test11:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE-NEXT:    movdqa %xmm2, %xmm3
+; SSE-NEXT:    psrad $31, %xmm3
+; SSE-NEXT:    movdqa %xmm3, %xmm4
+; SSE-NEXT:    pand %xmm0, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm6
+; SSE-NEXT:    psrad $31, %xmm6
+; SSE-NEXT:    pand %xmm2, %xmm6
+; SSE-NEXT:    paddd %xmm4, %xmm6
+; SSE-NEXT:    movdqa %xmm0, %xmm7
+; SSE-NEXT:    pmuludq %xmm2, %xmm7
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm5, %xmm4
+; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,3],xmm4[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
+; SSE-NEXT:    psubd %xmm6, %xmm7
+; SSE-NEXT:    paddd %xmm0, %xmm7
+; SSE-NEXT:    movdqa %xmm7, %xmm4
+; SSE-NEXT:    psrld $31, %xmm4
+; SSE-NEXT:    psrad $2, %xmm7
+; SSE-NEXT:    paddd %xmm4, %xmm7
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [7,7,7,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm7
+; SSE-NEXT:    pmuludq %xmm4, %xmm6
+; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[0,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
+; SSE-NEXT:    psubd %xmm7, %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm3
+; SSE-NEXT:    movdqa %xmm1, %xmm6
+; SSE-NEXT:    psrad $31, %xmm6
+; SSE-NEXT:    pand %xmm2, %xmm6
+; SSE-NEXT:    paddd %xmm3, %xmm6
+; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm5, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT:    psubd %xmm6, %xmm2
+; SSE-NEXT:    paddd %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm3
+; SSE-NEXT:    psrld $31, %xmm3
+; SSE-NEXT:    psrad $2, %xmm2
+; SSE-NEXT:    paddd %xmm3, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm2
+; SSE-NEXT:    pmuludq %xmm4, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT:    psubd %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test11:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpmuldq %ymm2, %ymm3, %ymm2
+; AVX-NEXT:    vpmuldq %ymm1, %ymm0, %ymm1
+; AVX-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
+; AVX-NEXT:    vpsrld $31, %ymm1, %ymm2
+; AVX-NEXT:    vpsrad $2, %ymm1, %ymm1
+; AVX-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
+; AVX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
   %rem = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
   ret <8 x i32> %rem
-
-; AVX-LABEL: test11:
-; AVX: vpalignr $4
-; AVX: vpbroadcastd
-; AVX: vpmuldq
-; AVX: vpmuldq
-; AVX: vpblendd $170
-; AVX: vpadd
-; AVX: vpsrld $31
-; AVX: vpsrad $2
-; AVX: vpadd
-; AVX: vpmulld
 }
 
 define <2 x i16> @test12() {
+; SSE41-LABEL: test12:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test12:
+; SSE:       # BB#0:
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test12:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %I8 = insertelement <2 x i16> zeroinitializer, i16 -1, i32 0
   %I9 = insertelement <2 x i16> %I8, i16 -1, i32 1
   %B9 = urem <2 x i16> %I9, %I9
   ret <2 x i16> %B9
+}
 
-; AVX-LABEL: test12:
-; AVX: xorps
+define <4 x i32> @PR20355(<4 x i32> %a) {
+; SSE41-LABEL: PR20355:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm2, %xmm1
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE41-NEXT:    movaps %xmm0, %xmm1
+; SSE41-NEXT:    psrld $31, %xmm1
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: PR20355:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    psrad $31, %xmm2
+; SSE-NEXT:    pand %xmm0, %xmm2
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psrad $31, %xmm3
+; SSE-NEXT:    pand %xmm1, %xmm3
+; SSE-NEXT:    paddd %xmm2, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    psubd %xmm3, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $31, %xmm1
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: PR20355:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[1,3],xmm0[1,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    vpsrld $31, %xmm0, %xmm1
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %sdiv = sdiv <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %sdiv
 }
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
new file mode 100644
index 0000000..7a329d7
--- /dev/null
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -0,0 +1,943 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+;
+; Just one 32-bit run to make sure we do reasonable things there.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=i686 -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41
+
+define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_8i16_to_8i32:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:   # kill: XMM0<def> XMM1<kill>
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_8i16_to_8i32:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:   # kill: XMM0<def> XMM1<kill>
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    pslld $16, %xmm0
+; SSSE3-NEXT:    psrad $16, %xmm0
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    pslld $16, %xmm1
+; SSSE3-NEXT:    psrad $16, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_8i16_to_8i32:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pmovzxwd %xmm1, %xmm0
+; SSE41-NEXT:    pslld $16, %xmm0
+; SSE41-NEXT:    psrad $16, %xmm0
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pslld $16, %xmm1
+; SSE41-NEXT:    psrad $16, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_8i16_to_8i32:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_8i16_to_8i32:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; X32-SSE41-LABEL: sext_8i16_to_8i32:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE41-NEXT:    pmovzxwd %xmm1, %xmm0
+; X32-SSE41-NEXT:    pslld $16, %xmm0
+; X32-SSE41-NEXT:    psrad $16, %xmm0
+; X32-SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; X32-SSE41-NEXT:    pslld $16, %xmm1
+; X32-SSE41-NEXT:    psrad $16, %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = sext <8 x i16> %A to <8 x i32>
+  ret <8 x i32>%B
+}
+
+define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_4i32_to_4i64:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE2-NEXT:    movd %xmm0, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_4i32_to_4i64:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSSE3-NEXT:    movd %xmm1, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm1, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-NEXT:    movd %xmm0, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm0, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_4i32_to_4i64:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovzxdq %xmm0, %xmm1
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm3
+; SSE41-NEXT:    movd %xmm1, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm2
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm3
+; SSE41-NEXT:    movd %xmm0, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm1
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_4i32_to_4i64:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_4i32_to_4i64:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; X32-SSE41-LABEL: sext_4i32_to_4i64:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    pmovzxdq %xmm0, %xmm2
+; X32-SSE41-NEXT:    movd %xmm2, %eax
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pextrd $2, %xmm2, %ecx
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm2
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrd $3, %ecx, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; X32-SSE41-NEXT:    movd %xmm1, %eax
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pextrd $2, %xmm1, %ecx
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm1
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrd $3, %ecx, %xmm1
+; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = sext <4 x i32> %A to <4 x i64>
+  ret <4 x i64>%B
+}
+
+define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
+; SSE2-LABEL: load_sext_test1:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq (%rdi), %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_test1:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movq (%rdi), %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    psrad $16, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_test1:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_test1:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_test1:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <4 x i16>* %ptr
+ %Y = sext <4 x i16> %X to <4 x i32>
+ ret <4 x i32>%Y
+}
+
+define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_test2:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movd (%rdi), %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_test2:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movd (%rdi), %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    psrad $24, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_test2:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_test2:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_test2:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <4 x i8>* %ptr
+ %Y = sext <4 x i8> %X to <4 x i32>
+ ret <4 x i32>%Y
+}
+
+define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_test3:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsbq 1(%rdi), %rax
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    movsbq (%rdi), %rax
+; SSE2-NEXT:    movd %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_test3:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsbq 1(%rdi), %rax
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    movsbq (%rdi), %rax
+; SSSE3-NEXT:    movd %rax, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_test3:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_test3:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpmovsxbq (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_test3:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <2 x i8>* %ptr
+ %Y = sext <2 x i8> %X to <2 x i64>
+ ret <2 x i64>%Y
+}
+
+define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
+; SSE2-LABEL: load_sext_test4:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movswq 2(%rdi), %rax
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    movswq (%rdi), %rax
+; SSE2-NEXT:    movd %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_test4:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movswq 2(%rdi), %rax
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    movswq (%rdi), %rax
+; SSSE3-NEXT:    movd %rax, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_test4:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_test4:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpmovsxwq (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_test4:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <2 x i16>* %ptr
+ %Y = sext <2 x i16> %X to <2 x i64>
+ ret <2 x i64>%Y
+}
+
+define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
+; SSE2-LABEL: load_sext_test5:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movslq 4(%rdi), %rax
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    movslq (%rdi), %rax
+; SSE2-NEXT:    movd %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_test5:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movslq 4(%rdi), %rax
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    movslq (%rdi), %rax
+; SSSE3-NEXT:    movd %rax, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_test5:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_test5:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpmovsxdq (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_test5:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <2 x i32>* %ptr
+ %Y = sext <2 x i32> %X to <2 x i64>
+ ret <2 x i64>%Y
+}
+
+define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_test6:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq (%rdi), %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_test6:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movq (%rdi), %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    psraw $8, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_test6:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_test6:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_test6:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <8 x i8>* %ptr
+ %Y = sext <8 x i8> %X to <8 x i16>
+ ret <8 x i16>%Y
+}
+
+define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
+; SSE2-LABEL: sext_4i1_to_4i64:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pslld $31, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE2-NEXT:    movd %xmm0, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_4i1_to_4i64:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pslld $31, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSSE3-NEXT:    movd %xmm1, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm1, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-NEXT:    movd %xmm0, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm0, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_4i1_to_4i64:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pslld $31, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pmovzxdq %xmm0, %xmm1
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm3
+; SSE41-NEXT:    movd %xmm1, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm2
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm3
+; SSE41-NEXT:    movd %xmm0, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm1
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_4i1_to_4i64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_4i1_to_4i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; X32-SSE41-LABEL: sext_4i1_to_4i64:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    pslld $31, %xmm0
+; X32-SSE41-NEXT:    psrad $31, %xmm0
+; X32-SSE41-NEXT:    pmovzxdq %xmm0, %xmm2
+; X32-SSE41-NEXT:    movd %xmm2, %eax
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pextrd $2, %xmm2, %ecx
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm2
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrd $3, %ecx, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; X32-SSE41-NEXT:    movd %xmm1, %eax
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pextrd $2, %xmm1, %ecx
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm1
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrd $3, %ecx, %xmm1
+; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT:    retl
+  %extmask = sext <4 x i1> %mask to <4 x i64>
+  ret <4 x i64> %extmask
+}
+
+define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
+; SSE2-LABEL: sext_16i8_to_16i16:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psllw $8, %xmm1
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_16i8_to_16i16:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa (%rdi), %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    psllw $8, %xmm0
+; SSSE3-NEXT:    psraw $8, %xmm0
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT:    psllw $8, %xmm1
+; SSSE3-NEXT:    psraw $8, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_16i8_to_16i16:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa (%rdi), %xmm1
+; SSE41-NEXT:    pmovzxbw %xmm1, %xmm0
+; SSE41-NEXT:    psllw $8, %xmm0
+; SSE41-NEXT:    psraw $8, %xmm0
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    psllw $8, %xmm1
+; SSE41-NEXT:    psraw $8, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_16i8_to_16i16:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_16i8_to_16i16:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; X32-SSE41-LABEL: sext_16i8_to_16i16:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movdqa (%eax), %xmm1
+; X32-SSE41-NEXT:    pmovzxbw %xmm1, %xmm0
+; X32-SSE41-NEXT:    psllw $8, %xmm0
+; X32-SSE41-NEXT:    psraw $8, %xmm0
+; X32-SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X32-SSE41-NEXT:    psllw $8, %xmm1
+; X32-SSE41-NEXT:    psraw $8, %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <16 x i8>* %ptr
+ %Y = sext <16 x i8> %X to <16 x i16>
+ ret <16 x i16> %Y
+}
+
+define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
+; SSE2-LABEL: sext_4i8_to_4i64:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pslld $24, %xmm0
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE2-NEXT:    movd %xmm0, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_4i8_to_4i64:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pslld $24, %xmm0
+; SSSE3-NEXT:    psrad $24, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSSE3-NEXT:    movd %xmm1, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm1, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-NEXT:    movd %xmm0, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm0, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_4i8_to_4i64:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pslld $24, %xmm0
+; SSE41-NEXT:    psrad $24, %xmm0
+; SSE41-NEXT:    pmovzxdq %xmm0, %xmm1
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm3
+; SSE41-NEXT:    movd %xmm1, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm2
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm3
+; SSE41-NEXT:    movd %xmm0, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm1
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_4i8_to_4i64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_4i8_to_4i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; X32-SSE41-LABEL: sext_4i8_to_4i64:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    pslld $24, %xmm0
+; X32-SSE41-NEXT:    psrad $24, %xmm0
+; X32-SSE41-NEXT:    pmovzxdq %xmm0, %xmm2
+; X32-SSE41-NEXT:    movd %xmm2, %eax
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pextrd $2, %xmm2, %ecx
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm2
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrd $3, %ecx, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; X32-SSE41-NEXT:    movd %xmm1, %eax
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pextrd $2, %xmm1, %ecx
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm1
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrd $3, %ecx, %xmm1
+; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT:    retl
+  %extmask = sext <4 x i8> %mask to <4 x i64>
+  ret <4 x i64> %extmask
+}
+
+define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_4i8_to_4i64:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movd (%rdi), %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; SSE2-NEXT:    movd %xmm2, %rax
+; SSE2-NEXT:    movsbq %al, %rax
+; SSE2-NEXT:    movd %rax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, %rax
+; SSE2-NEXT:    movsbq %al, %rax
+; SSE2-NEXT:    movd %rax, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; SSE2-NEXT:    movd %xmm2, %rax
+; SSE2-NEXT:    movsbq %al, %rax
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, %rax
+; SSE2-NEXT:    movsbq %al, %rax
+; SSE2-NEXT:    movd %rax, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_4i8_to_4i64:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movd (%rdi), %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; SSSE3-NEXT:    movd %xmm2, %rax
+; SSSE3-NEXT:    movsbq %al, %rax
+; SSSE3-NEXT:    movd %rax, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm2, %rax
+; SSSE3-NEXT:    movsbq %al, %rax
+; SSSE3-NEXT:    movd %rax, %xmm2
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; SSSE3-NEXT:    movd %xmm2, %rax
+; SSSE3-NEXT:    movsbq %al, %rax
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm2, %rax
+; SSSE3-NEXT:    movsbq %al, %rax
+; SSSE3-NEXT:    movd %rax, %xmm2
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_4i8_to_4i64:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovzxbd (%rdi), %xmm1
+; SSE41-NEXT:    pmovzxdq %xmm1, %xmm0
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    movsbq %al, %rax
+; SSE41-NEXT:    movd %rax, %xmm2
+; SSE41-NEXT:    movd %xmm0, %rax
+; SSE41-NEXT:    movsbq %al, %rax
+; SSE41-NEXT:    movd %rax, %xmm0
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    movsbq %al, %rax
+; SSE41-NEXT:    movd %rax, %xmm2
+; SSE41-NEXT:    movd %xmm1, %rax
+; SSE41-NEXT:    movsbq %al, %rax
+; SSE41-NEXT:    movd %rax, %xmm1
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_4i8_to_4i64:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_4i8_to_4i64:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movd (%eax), %xmm0
+; X32-SSE41-NEXT:    pmovzxbd %xmm0, %xmm1
+; X32-SSE41-NEXT:    pmovzxbq %xmm0, %xmm2
+; X32-SSE41-NEXT:    movd %xmm2, %eax
+; X32-SSE41-NEXT:    movsbl %al, %eax
+; X32-SSE41-NEXT:    movd %eax, %xmm0
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm0
+; X32-SSE41-NEXT:    pextrd $2, %xmm2, %eax
+; X32-SSE41-NEXT:    movsbl %al, %eax
+; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm0
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; X32-SSE41-NEXT:    movd %xmm2, %eax
+; X32-SSE41-NEXT:    movsbl %al, %eax
+; X32-SSE41-NEXT:    movd %eax, %xmm1
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm1
+; X32-SSE41-NEXT:    pextrd $2, %xmm2, %eax
+; X32-SSE41-NEXT:    movsbl %al, %eax
+; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm1
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <4 x i8>* %ptr
+ %Y = sext <4 x i8> %X to <4 x i64>
+ ret <4 x i64>%Y
+}
+
+define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
+; SSE2-LABEL: load_sext_4i16_to_4i64:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq (%rdi), %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; SSE2-NEXT:    movd %xmm2, %rax
+; SSE2-NEXT:    movswq %ax, %rax
+; SSE2-NEXT:    movd %rax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, %rax
+; SSE2-NEXT:    movswq %ax, %rax
+; SSE2-NEXT:    movd %rax, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; SSE2-NEXT:    movd %xmm2, %rax
+; SSE2-NEXT:    movswq %ax, %rax
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, %rax
+; SSE2-NEXT:    movswq %ax, %rax
+; SSE2-NEXT:    movd %rax, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_4i16_to_4i64:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movq (%rdi), %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; SSSE3-NEXT:    movd %xmm2, %rax
+; SSSE3-NEXT:    movswq %ax, %rax
+; SSSE3-NEXT:    movd %rax, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm2, %rax
+; SSSE3-NEXT:    movswq %ax, %rax
+; SSSE3-NEXT:    movd %rax, %xmm2
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; SSSE3-NEXT:    movd %xmm2, %rax
+; SSSE3-NEXT:    movswq %ax, %rax
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm2, %rax
+; SSSE3-NEXT:    movswq %ax, %rax
+; SSSE3-NEXT:    movd %rax, %xmm2
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_4i16_to_4i64:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movq (%rdi), %xmm0
+; SSE41-NEXT:    pmovzxwd %xmm0, %xmm1
+; SSE41-NEXT:    pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    movswq %ax, %rax
+; SSE41-NEXT:    movd %rax, %xmm2
+; SSE41-NEXT:    movd %xmm0, %rax
+; SSE41-NEXT:    movswq %ax, %rax
+; SSE41-NEXT:    movd %rax, %xmm0
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    movswq %ax, %rax
+; SSE41-NEXT:    movd %rax, %xmm2
+; SSE41-NEXT:    movd %xmm1, %rax
+; SSE41-NEXT:    movswq %ax, %rax
+; SSE41-NEXT:    movd %rax, %xmm1
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_4i16_to_4i64:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_4i16_to_4i64:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovsxwq (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movsd (%eax), %xmm0
+; X32-SSE41-NEXT:    pmovzxwd %xmm0, %xmm1
+; X32-SSE41-NEXT:    pmovzxwq %xmm0, %xmm2
+; X32-SSE41-NEXT:    movd %xmm2, %eax
+; X32-SSE41-NEXT:    cwtl
+; X32-SSE41-NEXT:    movd %eax, %xmm0
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm0
+; X32-SSE41-NEXT:    pextrd $2, %xmm2, %eax
+; X32-SSE41-NEXT:    cwtl
+; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm0
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; X32-SSE41-NEXT:    movd %xmm2, %eax
+; X32-SSE41-NEXT:    cwtl
+; X32-SSE41-NEXT:    movd %eax, %xmm1
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm1
+; X32-SSE41-NEXT:    pextrd $2, %xmm2, %eax
+; X32-SSE41-NEXT:    cwtl
+; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm1
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <4 x i16>* %ptr
+ %Y = sext <4 x i16> %X to <4 x i64>
+ ret <4 x i64>%Y
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 4da7e42..30ad366 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1,196 +1,1110 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; FIXME: SSE2 should look like the following:
+; FIXME-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
+; FIXME:       # BB#0:
+; FIXME-NEXT:    punpcklbw %xmm0, %xmm0
+; FIXME-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; FIXME-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
+; FIXME-NEXT:    retq
+;
+; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,2,4,5,6,7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    punpcklwd %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    punpckhwd %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_0101010101010101
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; FIXME: SSE2 should be the following:
+; FIXME-LABEL: @shuffle_v16i8_0101010101010101
+; FIXME:       # BB#0:
+; FIXME-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; FIXME-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
+; FIXME-NEXT:    retq
+;
+; SSE2-LABEL: shuffle_v16i8_0101010101010101:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_0101010101010101:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_0101010101010101:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v16i8_0101010101010101:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i8_0101010101010101:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23
-; CHECK-SSE2:         punpcklbw %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   ret <16 x i8> %shuffle
 }
 
+define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i8> %shuffle
+}
+
 define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklbw %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12
-; CHECK-SSE2:         pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    punpckhbw %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm2 = xmm2[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    punpcklbw %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    packuswb %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20
-; CHECK-SSE2:         pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT:    punpcklbw %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    punpcklbw %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    packuswb %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20
-; CHECK-SSE2:         pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    punpcklbw %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    punpckhbw %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    shufpd {{.*}} # xmm4 = xmm4[0],xmm3[1]
-; CHECK-SSE2-NEXT:    punpckhbw %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    punpcklbw %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
-; CHECK-SSE2-NEXT:    packuswb %xmm4, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    movsd %xmm4, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    movsd %xmm0, %xmm1
+; SSE2-NEXT:    packuswb %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20>
   ret <16 x i8> %shuffle
 }
 
-define <16 x i8> @zext_to_v8i16_shuffle(<16 x i8> %a) {
-; CHECK-SSE2-LABEL: @zext_to_v8i16_shuffle
-; CHECK-SSE2:         pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    punpcklbw %xmm1, %xmm0
-  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
+define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
+; SSE2-LABEL: trunc_v4i32_shuffle:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_v4i32_shuffle:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_v4i32_shuffle:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: trunc_v4i32_shuffle:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @stress_test0(<16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i8> %s.0.7, <16 x i8> %s.0.8, <16 x i8> %s.0.9) {
+; We don't have anything useful to check here. This generates 100s of
+; instructions. Instead, just make sure we survived codegen.
+; ALL-LABEL: stress_test0:
+; ALL:         retq
+entry:
+  %s.1.4 = shufflevector <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i32> <i32 1, i32 22, i32 21, i32 28, i32 3, i32 16, i32 6, i32 1, i32 19, i32 29, i32 12, i32 31, i32 2, i32 3, i32 3, i32 6>
+  %s.1.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i32> <i32 31, i32 20, i32 12, i32 19, i32 2, i32 15, i32 12, i32 31, i32 2, i32 28, i32 2, i32 30, i32 7, i32 8, i32 17, i32 28>
+  %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> %s.0.9, <16 x i32> <i32 14, i32 10, i32 17, i32 5, i32 17, i32 9, i32 17, i32 21, i32 31, i32 24, i32 16, i32 6, i32 20, i32 28, i32 23, i32 8>
+  %s.2.2 = shufflevector <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i32> <i32 20, i32 9, i32 21, i32 11, i32 11, i32 4, i32 3, i32 18, i32 3, i32 30, i32 4, i32 31, i32 11, i32 24, i32 13, i32 29>
+  %s.3.2 = shufflevector <16 x i8> %s.2.2, <16 x i8> %s.1.4, <16 x i32> <i32 15, i32 13, i32 5, i32 11, i32 7, i32 17, i32 14, i32 22, i32 22, i32 16, i32 7, i32 24, i32 16, i32 22, i32 7, i32 29>
+  %s.5.4 = shufflevector <16 x i8> %s.1.5, <16 x i8> %s.1.8, <16 x i32> <i32 3, i32 13, i32 19, i32 7, i32 23, i32 11, i32 1, i32 9, i32 16, i32 25, i32 2, i32 7, i32 0, i32 21, i32 23, i32 17>
+  %s.6.1 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.3.2, <16 x i32> <i32 11, i32 2, i32 28, i32 31, i32 27, i32 3, i32 9, i32 27, i32 25, i32 25, i32 14, i32 7, i32 12, i32 28, i32 12, i32 23>
+  %s.7.1 = shufflevector <16 x i8> %s.6.1, <16 x i8> %s.3.2, <16 x i32> <i32 15, i32 29, i32 14, i32 0, i32 29, i32 15, i32 26, i32 30, i32 6, i32 7, i32 2, i32 8, i32 12, i32 10, i32 29, i32 17>
+  %s.7.2 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.5.4, <16 x i32> <i32 3, i32 29, i32 3, i32 19, i32 undef, i32 20, i32 undef, i32 3, i32 27, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s.16.0 = shufflevector <16 x i8> %s.7.1, <16 x i8> %s.7.2, <16 x i32> <i32 13, i32 1, i32 16, i32 16, i32 6, i32 7, i32 29, i32 18, i32 19, i32 28, i32 undef, i32 undef, i32 31, i32 1, i32 undef, i32 10>
+  ret <16 x i8> %s.16.0
+}
+
+define <16 x i8> @stress_test1(<16 x i8> %s.0.5, <16 x i8> %s.0.8, <16 x i8> %s.0.9) noinline nounwind {
+; There is nothing interesting to check about these instructions other than
+; that they survive codegen. However, we actually do better and delete all of
+; them because the result is 'undef'.
+;
+; ALL-LABEL: stress_test1:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    retq
+entry:
+  %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> undef, <16 x i32> <i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 6, i32 undef, i32 6, i32 undef, i32 14, i32 14, i32 undef, i32 undef, i32 0>
+  %s.2.4 = shufflevector <16 x i8> undef, <16 x i8> %s.0.5, <16 x i32> <i32 21, i32 undef, i32 undef, i32 19, i32 undef, i32 undef, i32 29, i32 24, i32 21, i32 23, i32 21, i32 17, i32 19, i32 undef, i32 20, i32 22>
+  %s.2.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> undef, <16 x i32> <i32 3, i32 8, i32 undef, i32 7, i32 undef, i32 10, i32 8, i32 0, i32 15, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 9>
+  %s.2.9 = shufflevector <16 x i8> %s.0.9, <16 x i8> undef, <16 x i32> <i32 7, i32 undef, i32 14, i32 7, i32 8, i32 undef, i32 7, i32 8, i32 5, i32 15, i32 undef, i32 1, i32 11, i32 undef, i32 undef, i32 11>
+  %s.3.4 = shufflevector <16 x i8> %s.2.4, <16 x i8> %s.0.5, <16 x i32> <i32 5, i32 0, i32 21, i32 6, i32 15, i32 27, i32 22, i32 21, i32 4, i32 22, i32 19, i32 26, i32 9, i32 26, i32 8, i32 29>
+  %s.3.9 = shufflevector <16 x i8> %s.2.9, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 8, i32 1, i32 undef, i32 4, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 undef>
+  %s.4.7 = shufflevector <16 x i8> %s.1.8, <16 x i8> %s.2.9, <16 x i32> <i32 9, i32 0, i32 22, i32 20, i32 24, i32 7, i32 21, i32 17, i32 20, i32 12, i32 19, i32 23, i32 2, i32 9, i32 17, i32 10>
+  %s.4.8 = shufflevector <16 x i8> %s.2.9, <16 x i8> %s.3.9, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 10, i32 undef, i32 0, i32 5, i32 undef, i32 9, i32 undef>
+  %s.5.7 = shufflevector <16 x i8> %s.4.7, <16 x i8> %s.4.8, <16 x i32> <i32 16, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s.8.4 = shufflevector <16 x i8> %s.3.4, <16 x i8> %s.5.7, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 28, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s.9.4 = shufflevector <16 x i8> %s.8.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 5>
+  %s.10.4 = shufflevector <16 x i8> %s.9.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s.12.4 = shufflevector <16 x i8> %s.10.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef>
+
+  ret <16 x i8> %s.12.4
+}
+
+define <16 x i8> @PR20540(<8 x i8> %a) {
+; SSE2-LABEL: PR20540:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: PR20540:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: PR20540:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: PR20540:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE2-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl %dil, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd %edi, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %edi, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd %edi, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl %dil, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd %edi, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,0],zero,xmm1[0,0,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %edi, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,0],zero,xmm1[0,0,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd %edi, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,0],zero,xmm1[0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
+; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd %edi, %xmm0
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd %edi, %xmm0
+; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %edi, %xmm0
+; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd %edi, %xmm0
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl %dil, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd %edi, %xmm0
+; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %edi, %xmm0
+; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd %edi, %xmm0
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 3
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 09, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 undef, i32 18, i32 undef>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 undef, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 09, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxbq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxbq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   ret <16 x i8> %shuffle
 }
 
-define <16 x i8> @zext_to_v4i32_shuffle(<16 x i8> %a) {
-; CHECK-SSE2-LABEL: @zext_to_v4i32_shuffle
-; CHECK-SSE2:         pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    punpcklbw %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    punpcklbw %xmm1, %xmm0
+define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxbd %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxbd %xmm0, %xmm0
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
   ret <16 x i8> %shuffle
 }
 
-define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
-; CHECK-SSE2-LABEL: @trunc_v4i32_shuffle
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pand
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    packuswb %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    retq
-  %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbw %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxbw %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbw %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxbw %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
   ret <16 x i8> %shuffle
 }
+
+define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,0,3,1,4,5,6,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[2,1,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,3,1,4,5,6,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT:    packuswb %xmm0, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT:    packuswb %xmm0, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX-NEXT:    retq
+entry:
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
+
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @stress_test2(<16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i8> %s.0.2) {
+; Nothing interesting to test here. Just make sure we didn't crashe.
+; ALL-LABEL: stress_test2:
+; ALL:         retq
+entry:
+  %s.1.0 = shufflevector <16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i32> <i32 29, i32 30, i32 2, i32 16, i32 26, i32 21, i32 11, i32 26, i32 26, i32 3, i32 4, i32 5, i32 30, i32 28, i32 15, i32 5>
+  %s.1.1 = shufflevector <16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i32> <i32 31, i32 1, i32 24, i32 12, i32 28, i32 5, i32 2, i32 9, i32 29, i32 1, i32 31, i32 5, i32 6, i32 17, i32 15, i32 22>
+  %s.2.0 = shufflevector <16 x i8> %s.1.0, <16 x i8> %s.1.1, <16 x i32> <i32 22, i32 1, i32 12, i32 3, i32 30, i32 4, i32 30, i32 undef, i32 1, i32 10, i32 14, i32 18, i32 27, i32 13, i32 16, i32 19>
+
+  ret <16 x i8> %s.2.0
+}
+
+define void @constant_gets_selected() {
+; ALL-LABEL: constant_gets_selected:
+; ALL-NOT movd $0, {{%xmm[0-9]+}}
+  %weird_zero = bitcast <4 x i32> zeroinitializer to <16 x i8>
+  %shuffle.i = shufflevector <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %weird_zero, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+  %weirder_zero = bitcast <16 x i8> %shuffle.i to <4 x i32>
+  store <4 x i32> %weirder_zero, <4 x i32>* undef, align 16
+  store <4 x i32> zeroinitializer, <4 x i32>* undef, align 16
+  ret void
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 78b4ee7..9affee9 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1,219 +1,1138 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
 define <2 x i64> @shuffle_v2i64_00(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_00
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_00:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v2i64_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v2i64_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 0>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_10(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_10
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_10:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_10:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 0>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_11(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_11
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_11:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_11:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 1>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_22(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_22
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[0,1,0,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_22:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v2i64_22:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v2i64_22:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm0
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 2>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_32(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_32
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[2,3,0,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_32:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 2>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_33(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_33
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_33:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_33:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 3>
   ret <2 x i64> %shuffle
 }
 
 define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2f64_00
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[0,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2f64_00:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2f64_00:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2f64_00:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2f64_00:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_00:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
   ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2f64_10
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[1,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2f64_10:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_10:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 0>
   ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2f64_11
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[1,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2f64_11:
+; SSE:       # BB#0:
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_11:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
   ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) {
-; FIXME: Should these use movapd + shufpd to remove a domain change at the cost
-;        of a mov?
+; SSE2-LABEL: shuffle_v2f64_22:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2f64_22:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; SSE3-NEXT:    movapd %xmm1, %xmm0
+; SSE3-NEXT:    retq
 ;
-; CHECK-SSE2-LABEL: @shuffle_v2f64_22
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[0,1,0,1]
-; CHECK-SSE2-NEXT:    retq
+; SSSE3-LABEL: shuffle_v2f64_22:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2f64_22:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_22:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2>
   ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2f64_32
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[2,3,0,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2f64_32:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 2>
   ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2f64_33
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2f64_33:
+; SSE:       # BB#0:
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_33:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm1[1,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3>
   ret <2 x double> %shuffle
 }
+define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
+; SSE2-LABEL: shuffle_v2f64_03:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2f64_03:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movsd %xmm0, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2f64_03:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2f64_03:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_03:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuffle
+}
+define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
+; SSE2-LABEL: shuffle_v2f64_21:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2f64_21:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2f64_21:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2f64_21:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_21:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
+  ret <2 x double> %shuffle
+}
 
 
 define <2 x i64> @shuffle_v2i64_02(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_02
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[0],xmm1[0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_02:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_02:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_02_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_02_copy
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[0],xmm2[0]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_02_copy:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_02_copy:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm2[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_03
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2i64_03:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_03:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movsd %xmm0, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_03:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_03:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v2i64_03:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v2i64_03:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_03_copy
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2i64_03_copy:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_03_copy:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movsd %xmm1, %xmm2
+; SSE3-NEXT:    movaps %xmm2, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_03_copy:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm1, %xmm2
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_03_copy:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v2i64_03_copy:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v2i64_03_copy:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_12
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2i64_12:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_12:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_12:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_12:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_12:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_12_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_12_copy
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[1],xmm2[0]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2i64_12_copy:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_12_copy:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
+; SSE3-NEXT:    movapd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_12_copy:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_12_copy:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_12_copy:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_13(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_13
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_13:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_13:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_13_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_13_copy
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_13_copy:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_13_copy:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm2[1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_20(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_20
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[0],xmm0[0]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_20:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_20:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_20_copy
-; CHECK-SSE2:         shufpd {{.*}} # xmm2 = xmm2[0],xmm1[0]
-; CHECK-SSE2-NEXT:    movapd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_20_copy:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_20_copy:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_21
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2i64_21:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_21:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_21:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_21:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v2i64_21:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v2i64_21:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_21_copy
-; CHECK-SSE2:         shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
-; CHECK-SSE2-NEXT:    movapd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2i64_21_copy:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm2, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_21_copy:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movsd %xmm2, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_21_copy:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm2, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_21_copy:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v2i64_21_copy:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v2i64_21_copy:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_30(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_30
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[1],xmm0[0]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2i64_30:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_30:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; SSE3-NEXT:    movapd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_30:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_30:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_30:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_30_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_30_copy
-; CHECK-SSE2:         shufpd {{.*}} # xmm2 = xmm2[1],xmm1[0]
-; CHECK-SSE2-NEXT:    movapd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2i64_30_copy:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
+; SSE2-NEXT:    movapd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_30_copy:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
+; SSE3-NEXT:    movapd %xmm2, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_30_copy:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_30_copy:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_30_copy:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_31(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_31
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[1],xmm0[1]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_31:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_31:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_31_copy
-; CHECK-SSE2:         shufpd {{.*}} # xmm2 = xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movapd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_31_copy:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_31_copy:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm1[1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
   ret <2 x i64> %shuffle
 }
+
+define <2 x i64> @shuffle_v2i64_0z(<2 x i64> %a) {
+; SSE-LABEL: shuffle_v2i64_0z:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_0z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @shuffle_v2i64_1z(<2 x i64> %a) {
+; SSE-LABEL: shuffle_v2i64_1z:
+; SSE:       # BB#0:
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_1z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @shuffle_v2i64_z0(<2 x i64> %a) {
+; SSE-LABEL: shuffle_v2i64_z0:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq %xmm0, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_z0:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 0>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) {
+; SSE2-LABEL: shuffle_v2i64_z1:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_z1:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_z1:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_z1:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v2i64_z1:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v2i64_z1:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 1>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x double> @shuffle_v2f64_0z(<2 x double> %a) {
+; SSE-LABEL: shuffle_v2f64_0z:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_0z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @shuffle_v2f64_1z(<2 x double> %a) {
+; SSE-LABEL: shuffle_v2f64_1z:
+; SSE:       # BB#0:
+; SSE-NEXT:    xorpd %xmm1, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_1z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @shuffle_v2f64_z0(<2 x double> %a) {
+; SSE-LABEL: shuffle_v2f64_z0:
+; SSE:       # BB#0:
+; SSE-NEXT:    xorpd %xmm1, %xmm1
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_z0:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 0>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) {
+; SSE2-LABEL: shuffle_v2f64_z1:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2f64_z1:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2f64_z1:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2f64_z1:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorpd %xmm1, %xmm1
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_z1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
+  ret <2 x double> %shuffle
+}
+
+define <2 x i64> @insert_reg_and_zero_v2i64(i64 %a) {
+; SSE-LABEL: insert_reg_and_zero_v2i64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movd %rdi, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_and_zero_v2i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq %rdi, %xmm0
+; AVX-NEXT:    retq
+  %v = insertelement <2 x i64> undef, i64 %a, i32 0
+  %shuffle = shufflevector <2 x i64> %v, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @insert_mem_and_zero_v2i64(i64* %ptr) {
+; SSE-LABEL: insert_mem_and_zero_v2i64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_and_zero_v2i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq (%rdi), %xmm0
+; AVX-NEXT:    retq
+  %a = load i64* %ptr
+  %v = insertelement <2 x i64> undef, i64 %a, i32 0
+  %shuffle = shufflevector <2 x i64> %v, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x double> @insert_reg_and_zero_v2f64(double %a) {
+; SSE-LABEL: insert_reg_and_zero_v2f64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_and_zero_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_mem_and_zero_v2f64(double* %ptr) {
+; SSE-LABEL: insert_mem_and_zero_v2f64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movsd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_and_zero_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovsd (%rdi), %xmm0
+; AVX-NEXT:    retq
+  %a = load double* %ptr
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuffle
+}
+
+define <2 x i64> @insert_reg_lo_v2i64(i64 %a, <2 x i64> %b) {
+; SSE2-LABEL: insert_reg_lo_v2i64:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd %rdi, %xmm1
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: insert_reg_lo_v2i64:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movd %rdi, %xmm1
+; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: insert_reg_lo_v2i64:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd %rdi, %xmm1
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_reg_lo_v2i64:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %rdi, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: insert_reg_lo_v2i64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovq %rdi, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_reg_lo_v2i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovq %rdi, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    retq
+  %v = insertelement <2 x i64> undef, i64 %a, i32 0
+  %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @insert_mem_lo_v2i64(i64* %ptr, <2 x i64> %b) {
+; SSE2-LABEL: insert_mem_lo_v2i64:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlpd (%rdi), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: insert_mem_lo_v2i64:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movlpd (%rdi), %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: insert_mem_lo_v2i64:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movlpd (%rdi), %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_mem_lo_v2i64:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movq (%rdi), %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: insert_mem_lo_v2i64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovq (%rdi), %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_mem_lo_v2i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovq (%rdi), %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    retq
+  %a = load i64* %ptr
+  %v = insertelement <2 x i64> undef, i64 %a, i32 0
+  %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @insert_reg_hi_v2i64(i64 %a, <2 x i64> %b) {
+; SSE-LABEL: insert_reg_hi_v2i64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movd %rdi, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_hi_v2i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq %rdi, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %v = insertelement <2 x i64> undef, i64 %a, i32 0
+  %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) {
+; SSE-LABEL: insert_mem_hi_v2i64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq (%rdi), %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_hi_v2i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq (%rdi), %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %a = load i64* %ptr
+  %v = insertelement <2 x i64> undef, i64 %a, i32 0
+  %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) {
+; SSE-LABEL: insert_reg_lo_v2f64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_lo_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_mem_lo_v2f64(double* %ptr, <2 x double> %b) {
+; SSE-LABEL: insert_mem_lo_v2f64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movlpd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_lo_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovlpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = load double* %ptr
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_reg_hi_v2f64(double %a, <2 x double> %b) {
+; SSE-LABEL: insert_reg_hi_v2f64:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_hi_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 2, i32 0>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_mem_hi_v2f64(double* %ptr, <2 x double> %b) {
+; SSE-LABEL: insert_mem_hi_v2f64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movhpd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_hi_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovhpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = load double* %ptr
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 2, i32 0>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_dup_reg_v2f64(double %a) {
+; FIXME: We should match movddup for SSE3 and higher here.
+;
+; SSE2-LABEL: insert_dup_reg_v2f64:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: insert_dup_reg_v2f64:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: insert_dup_reg_v2f64:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_dup_reg_v2f64:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: insert_dup_reg_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    retq
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+}
+define <2 x double> @insert_dup_mem_v2f64(double* %ptr) {
+; SSE2-LABEL: insert_dup_mem_v2f64:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd (%rdi), %xmm0
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: insert_dup_mem_v2f64:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movddup (%rdi), %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: insert_dup_mem_v2f64:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movddup (%rdi), %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_dup_mem_v2f64:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movddup (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: insert_dup_mem_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovddup (%rdi), %xmm0
+; AVX-NEXT:    retq
+  %a = load double* %ptr
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @shuffle_mem_v2f64_10(<2 x double>* %ptr) {
+; SSE-LABEL: shuffle_mem_v2f64_10:
+; SSE:       # BB#0:
+; SSE-NEXT:    movapd (%rdi), %xmm0
+; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_mem_v2f64_10:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = mem[1,0]
+; AVX-NEXT:    retq
+  %a = load <2 x double>* %ptr
+  %shuffle = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  ret <2 x double> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 7d496fa..833b822 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1,170 +1,1386 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
 define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0001
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[0,0,0,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_0001:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0001:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_0020(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0020
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[0,0,2,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_0020:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0020:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
   ret <4 x i32> %shuffle
 }
+define <4 x i32> @shuffle_v4i32_0112(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: shuffle_v4i32_0112:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0112:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
+  ret <4 x i32> %shuffle
+}
 define <4 x i32> @shuffle_v4i32_0300(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0300
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[0,3,0,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_0300:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0300:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_1000(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_1000
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[1,0,0,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_1000:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_1000:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_2200(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_2200
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[2,2,0,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_2200:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_2200:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_3330(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_3330
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[3,3,3,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_3330:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_3330:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_3210(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_3210
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[3,2,1,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_3210:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_3210:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x i32> %shuffle
 }
 
+define <4 x i32> @shuffle_v4i32_2121(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: shuffle_v4i32_2121:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_2121:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 1, i32 2, i32 1>
+  ret <4 x i32> %shuffle
+}
+
 define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_0001
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[0,0,0,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4f32_0001:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_0001:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
   ret <4 x float> %shuffle
 }
 define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_0020
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[0,0,2,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4f32_0020:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_0020:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
   ret <4 x float> %shuffle
 }
 define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_0300
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[0,3,0,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4f32_0300:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_0300:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
   ret <4 x float> %shuffle
 }
 define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_1000
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[1,0,0,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4f32_1000:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_1000:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
   ret <4 x float> %shuffle
 }
 define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_2200
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[2,2,0,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4f32_2200:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_2200:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
   ret <4 x float> %shuffle
 }
 define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_3330
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[3,3,3,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4f32_3330:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_3330:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
   ret <4 x float> %shuffle
 }
 define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_3210
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[3,2,1,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4f32_3210:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_3210:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x float> %shuffle
 }
+define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: shuffle_v4f32_0011:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_0011:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: shuffle_v4f32_2233:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_2233:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: shuffle_v4f32_0022:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_0022:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_0022:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_0022:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_0022:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: shuffle_v4f32_1133:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_1133:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_1133:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_1133:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_1133:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  ret <4 x float> %shuffle
+}
 
 define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0124
-; CHECK-SSE2:         shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0]
-; CHECK-SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[2,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v4i32_0124:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_0124:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0124:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_0124:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0124:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0142
-; CHECK-SSE2:         shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0]
-; CHECK-SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,2]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_0142:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0142:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0412
-; CHECK-SSE2:         shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[0,0]
-; CHECK-SSE2-NEXT:    shufps {{.*}} # xmm1 = xmm1[2,0],xmm0[1,2]
-; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_0412:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0412:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[1,2]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_4012
-; CHECK-SSE2:         shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[0,0]
-; CHECK-SSE2-NEXT:    shufps {{.*}} # xmm1 = xmm1[0,2],xmm0[1,2]
-; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_4012:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_4012:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,2]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0145
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[0],xmm1[0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_0145:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0145:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0451
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,1]
-; CHECK-SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,3,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_0451:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0451:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_4501
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[0],xmm0[0]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_4501:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_4501:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_4015
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,1]
-; CHECK-SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[2,0,1,3]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_4015:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_4015:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
   ret <4 x i32> %shuffle
 }
+
+define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_4zzz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_4zzz:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_4zzz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_4zzz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_4zzz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_z4zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_z4zz:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_z4zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_z4zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_z4zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_zz4z:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_zz4z:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_zz4z:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_zz4z:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_zz4z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_zuu4:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_zuu4:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_zuu4:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_zuu4:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_zuu4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_zzz7:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_zzz7:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_zzz7:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_zzz7:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_zzz7:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_z6zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_z6zz:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_z6zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_z6zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_z6zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_4zzz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_4zzz:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_4zzz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_4zzz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_4zzz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_z4zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_z4zz:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_z4zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_z4zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_z4zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_zz4z:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_zz4z:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_zz4z:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_zz4z:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_zz4z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_zuu4:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_zuu4:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_zuu4:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_zuu4:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_zuu4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,0]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_z6zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_z6zz:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_z6zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_z6zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_z6zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_7012(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_7012:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_7012:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_7012:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_7012:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_7012:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_6701(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_6701:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_6701:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; SSE3-NEXT:    movapd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_6701:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_6701:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_6701:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_5670(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_5670:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_5670:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_5670:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_5670:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_5670:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 6, i32 7, i32 0>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_1234:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_1234:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_1234:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_1234:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_1234:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_2345:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_2345:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_2345:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_2345:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_2345:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_3456:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_3456:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_3456:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_3456:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_3456:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_0u1u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_0u1u:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0u1u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_0u1u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxdq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0u1u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxdq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_0z1z:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_0z1z:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0z1z:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_0z1z:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxdq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0z1z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxdq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
+; SSE-LABEL: insert_reg_and_zero_v4i32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movd %edi, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_and_zero_v4i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd %edi, %xmm0
+; AVX-NEXT:    retq
+  %v = insertelement <4 x i32> undef, i32 %a, i32 0
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
+; SSE-LABEL: insert_mem_and_zero_v4i32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_and_zero_v4i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd (%rdi), %xmm0
+; AVX-NEXT:    retq
+  %a = load i32* %ptr
+  %v = insertelement <4 x i32> undef, i32 %a, i32 0
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
+; SSE2-LABEL: insert_reg_and_zero_v4f32:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: insert_reg_and_zero_v4f32:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: insert_reg_and_zero_v4f32:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_reg_and_zero_v4f32:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_and_zero_v4f32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %v = insertelement <4 x float> undef, float %a, i32 0
+  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
+; SSE-LABEL: insert_mem_and_zero_v4f32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_and_zero_v4f32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovss (%rdi), %xmm0
+; AVX-NEXT:    retq
+  %a = load float* %ptr
+  %v = insertelement <4 x float> undef, float %a, i32 0
+  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
+; SSE2-LABEL: insert_reg_lo_v4i32:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd %rdi, %xmm1
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: insert_reg_lo_v4i32:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movd %rdi, %xmm1
+; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: insert_reg_lo_v4i32:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd %rdi, %xmm1
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_reg_lo_v4i32:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %rdi, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: insert_reg_lo_v4i32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovq %rdi, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_reg_lo_v4i32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovq %rdi, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    retq
+  %a.cast = bitcast i64 %a to <2 x i32>
+  %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
+; SSE2-LABEL: insert_mem_lo_v4i32:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlpd (%rdi), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: insert_mem_lo_v4i32:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movlpd (%rdi), %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: insert_mem_lo_v4i32:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movlpd (%rdi), %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_mem_lo_v4i32:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movq (%rdi), %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: insert_mem_lo_v4i32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovq (%rdi), %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_mem_lo_v4i32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovq (%rdi), %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    retq
+  %a = load <2 x i32>* %ptr
+  %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) {
+; SSE-LABEL: insert_reg_hi_v4i32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movd %rdi, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_hi_v4i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq %rdi, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %a.cast = bitcast i64 %a to <2 x i32>
+  %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
+; SSE-LABEL: insert_mem_hi_v4i32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq (%rdi), %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_hi_v4i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq (%rdi), %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %a = load <2 x i32>* %ptr
+  %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
+; SSE-LABEL: insert_reg_lo_v4f32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_lo_v4f32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %a.cast = bitcast double %a to <2 x float>
+  %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
+; SSE-LABEL: insert_mem_lo_v4f32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movlpd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_lo_v4f32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovlpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = load <2 x float>* %ptr
+  %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
+; SSE-LABEL: insert_reg_hi_v4f32:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_hi_v4f32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
+  %a.cast = bitcast double %a to <2 x float>
+  %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
+; SSE-LABEL: insert_mem_hi_v4f32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movhpd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_hi_v4f32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovhpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = load <2 x float>* %ptr
+  %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
+; SSE-LABEL: shuffle_mem_v4f32_3210:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_mem_v4f32_3210:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; AVX-NEXT:    retq
+  %a = load <4 x float>* %ptr
+  %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x float> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 5d1922a..59af434 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1,493 +1,1941 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
 define <8 x i16> @shuffle_v8i16_01012323(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_01012323
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,0,1,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_01012323:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_01012323:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_67452301(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_67452301
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[3,2,1,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_67452301:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_67452301:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_456789AB(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_456789AB
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_456789AB:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_456789AB:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_456789AB:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_456789AB:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_00000000
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_00000000:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_00000000:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_00000000:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v8i16_00000000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i16_00000000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_00004444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_00004444
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_00004444:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_00004444:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i16> %shuffle
 }
+define <8 x i16> @shuffle_v8i16_u0u1u2u3(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_u0u1u2u3:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_u0u1u2u3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3>
+  ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_u4u5u6u7(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_u4u5u6u7:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_u4u5u6u7:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7>
+  ret <8 x i16> %shuffle
+}
 define <8 x i16> @shuffle_v8i16_31206745(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_31206745
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,3,2]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_31206745:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_31206745:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 5>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_44440000(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_44440000
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,1,0,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_44440000:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_44440000:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_44440000:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_44440000:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0>
   ret <8 x i16> %shuffle
 }
+define <8 x i16> @shuffle_v8i16_23016745(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_23016745:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_23016745:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+  ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_23026745(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_23026745:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_23026745:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 5>
+  ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_23016747(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_23016747:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_23016747:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 7>
+  ret <8 x i16> %shuffle
+}
 define <8 x i16> @shuffle_v8i16_75643120(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_75643120
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_75643120:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_75643120:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_75643120:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_75643120:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 7, i32 5, i32 6, i32 4, i32 3, i32 1, i32 2, i32 0>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_10545410(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_10545410
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,4,7,6]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_10545410:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_10545410:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_10545410:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_10545410:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 5, i32 4, i32 5, i32 4, i32 1, i32 0>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_54105410(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_54105410
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,4,7,6]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_54105410:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_54105410:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_54105410:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_54105410:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 5, i32 4, i32 1, i32 0>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_54101054(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_54101054
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_54101054:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_54101054:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_54101054:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_54101054:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 1, i32 0, i32 5, i32 4>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_04400440(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_04400440
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,4,4,6]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_04400440:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,4,6]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_04400440:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_04400440:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_04400440:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 0>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_40044004(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_40044004
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[2,0,0,2,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_40044004:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,0,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_40044004:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_40044004:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_40044004:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 0, i32 0, i32 4, i32 4, i32 0, i32 0, i32 4>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_26405173(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_26405173
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_26405173:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_26405173:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_26405173:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_26405173:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 5, i32 1, i32 7, i32 3>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_20645173(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_20645173
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_20645173:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_20645173:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_20645173:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_20645173:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 0, i32 6, i32 4, i32 5, i32 1, i32 7, i32 3>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_26401375
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_26401375:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_26401375:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_26401375:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_26401375:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 5>
   ret <8 x i16> %shuffle
 }
 
+define <8 x i16> @shuffle_v8i16_66751643(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_66751643:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,3,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,6]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_66751643:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_66751643:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_66751643:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 6, i32 7, i32 5, i32 1, i32 6, i32 4, i32 3>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_60514754(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_60514754:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,5,6]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_60514754:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_60514754:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_60514754:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 6, i32 0, i32 5, i32 1, i32 4, i32 7, i32 5, i32 4>
+  ret <8 x i16> %shuffle
+}
+
 define <8 x i16> @shuffle_v8i16_00444444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_00444444
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_00444444:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_00444444:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_00444444:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_00444444:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_44004444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_44004444
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[2,2,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_44004444:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,2,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_44004444:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_44004444:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_44004444:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_04404444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_04404444
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_04404444:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_04404444:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_04404444:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_04404444:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_04400000(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_04400000
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,0,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_04400000:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_04400000:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_04400000:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_04400000:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_04404567(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_04404567
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_04404567:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_04404567:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_0X444444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0X444444
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,1,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_0X444444:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0X444444:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_0X444444:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0X444444:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 undef, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_44X04444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_44X04444
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[2,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_44X04444:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_44X04444:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_44X04444:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_44X04444:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 undef, i32 0, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_X4404444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_X4404444
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_X4404444:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_X4404444:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_X4404444:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_X4404444:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_0127XXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0127XXXX
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_0127XXXX:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0127XXXX:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_0127XXXX:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0127XXXX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_XXXX4563(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXX4563
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,2,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_XXXX4563:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_XXXX4563:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_XXXX4563:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_XXXX4563:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 3>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_4563XXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_4563XXXX
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,0,2,3]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_4563XXXX:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_4563XXXX:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_4563XXXX:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_4563XXXX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_01274563(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_01274563
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,5,4,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_01274563:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_01274563:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_01274563:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_01274563:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 3>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_45630127
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,3,1,2,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,0,1,3]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,5,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_45630127:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_45630127:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_45630127:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_45630127:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 7>
   ret <8 x i16> %shuffle
 }
 
+define <8 x i16> @shuffle_v8i16_37102735(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_37102735:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,6]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_37102735:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_37102735:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_37102735:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 7, i32 1, i32 0, i32 2, i32 7, i32 3, i32 5>
+  ret <8 x i16> %shuffle
+}
+
 define <8 x i16> @shuffle_v8i16_08192a3b(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_08192a3b
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_08192a3b:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_08192a3b:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0c1d2e3f
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_0c1d2e3f:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0c1d2e3f:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 2, i32 14, i32 3, i32 15>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_4c5d6e7f(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_4c5d6e7f
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_4c5d6e7f:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_4c5d6e7f:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_48596a7b
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_48596a7b:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_48596a7b:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 8, i32 5, i32 9, i32 6, i32 10, i32 7, i32 11>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_08196e7f(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_08196e7f
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[0,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_08196e7f:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_08196e7f:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 6, i32 14, i32 7, i32 15>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_0c1d6879(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0c1d6879
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,0,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_0c1d6879:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,0,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0c1d6879:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,0,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 6, i32 8, i32 7, i32 9>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_109832ba
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm1 = xmm0[2,0,3,1,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[2,0,3,1,4,5,6,7]
-; CHECK-SSE2-NEXT:    punpcklqdq %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_109832ba:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[2,0,3,1,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_109832ba:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[2,0,3,1,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 9, i32 8, i32 3, i32 2, i32 11, i32 10>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_8091a2b3(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_8091a2b3
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklwd %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_8091a2b3:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_8091a2b3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_c4d5e6f7(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_c4d5e6f7
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm2 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_c4d5e6f7:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_c4d5e6f7:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0213cedf
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm1 = xmm1[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    punpcklqdq %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_0213cedf:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0213cedf:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 12, i32 14, i32 13, i32 15>
   ret <8 x i16> %shuffle
 }
 
+define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_443aXXXX:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_443aXXXX:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_443aXXXX:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_443aXXXX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 3, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
 define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_032dXXXX
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,3,2,1,4,5,6,7]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_032dXXXX:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_032dXXXX:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_032dXXXX:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_032dXXXX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
-define <8 x i16> @shuffle_v8i16_XXXcXXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXcXXXX
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm1[2,1,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; CHECK-SSE2-NEXT:    retq
+define <8 x i16> @shuffle_v8i16_XXXdXXXX(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_XXXdXXXX:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_XXXdXXXX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_012dXXXX
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,2,0,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_012dXXXX:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_012dXXXX:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_012dXXXX:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_012dXXXX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXXcde3
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
-; CHECK-SSE2-NEXT:    punpckhwd %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,2]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_XXXXcde3:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_XXXXcde3:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_XXXXcde3:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v8i16_XXXXcde3:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i16_XXXXcde3:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 3>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_cde3XXXX
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
-; CHECK-SSE2-NEXT:    punpckhwd %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_cde3XXXX:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_cde3XXXX:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_cde3XXXX:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v8i16_cde3XXXX:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i16_cde3XXXX:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_012dcde3
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm2 = xmm0[0,1,2,1]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm3 = xmm1[2,1,2,3]
-; CHECK-SSE2-NEXT:    punpckhwd %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm3, %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,2,0,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    punpcklqdq %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_012dcde3:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_012dcde3:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_012dcde3:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v8i16_012dcde3:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i16_012dcde3:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm2
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 12, i32 13, i32 14, i32 3>
   ret <8 x i16> %shuffle
 }
+
+define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_XXX1X579:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_XXX1X579:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_XXX1X579:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_XXX1X579:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_XX4X8acX:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_XX4X8acX:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_XX4X8acX:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_XX4X8acX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_8zzzzzzz:
+; SSE:       # BB#0:
+; SSE-NEXT:    movzwl %di, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_8zzzzzzz:
+; AVX:       # BB#0:
+; AVX-NEXT:    movzwl %di, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_z8zzzzzz:
+; SSE:       # BB#0:
+; SSE-NEXT:    movzwl %di, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_z8zzzzzz:
+; AVX:       # BB#0:
+; AVX-NEXT:    movzwl %di, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 2, i32 8, i32 3, i32 7, i32 6, i32 5, i32 4, i32 3>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_zzzzz8zz:
+; SSE:       # BB#0:
+; SSE-NEXT:    movzwl %di, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_zzzzz8zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    movzwl %di, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_zuuzuuz8:
+; SSE:       # BB#0:
+; SSE-NEXT:    movzwl %di, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_zuuzuuz8:
+; AVX:       # BB#0:
+; AVX-NEXT:    movzwl %di, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; AVX-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 8>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_zzBzzzzz:
+; SSE:       # BB#0:
+; SSE-NEXT:    movzwl %di, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_zzBzzzzz:
+; AVX:       # BB#0:
+; AVX-NEXT:    movzwl %di, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 3
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 11, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_def01234(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_def01234:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_def01234:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_def01234:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_def01234:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_ueuu123u(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_ueuu123u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_ueuu123u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_ueuu123u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_ueuu123u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 14, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_56701234(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_56701234:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_56701234:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_56701234:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_56701234:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_u6uu123u(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_u6uu123u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_u6uu123u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_u6uu123u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_u6uu123u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_uuuu123u(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_uuuu123u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_uuuu123u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_uuuu123u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_uuuu123u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_bcdef012(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_bcdef012:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_bcdef012:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_bcdef012:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_bcdef012:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_ucdeuu1u(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_ucdeuu1u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_ucdeuu1u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_ucdeuu1u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_ucdeuu1u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 12, i32 13, i32 14, i32 undef, i32 undef, i32 1, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_34567012(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_34567012:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_34567012:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_34567012:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_34567012:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_u456uu1u(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_u456uu1u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_u456uu1u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_u456uu1u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_u456uu1u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 1, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_u456uuuu(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_u456uuuu:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_u456uuuu:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_u456uuuu:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_u456uuuu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_3456789a(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_3456789a:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_3456789a:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_3456789a:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_3456789a:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_u456uu9u(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_u456uu9u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_u456uu9u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_u456uu9u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_u456uu9u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 9, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_56789abc(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_56789abc:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_56789abc:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_56789abc:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_56789abc:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_u6uu9abu(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_u6uu9abu:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_u6uu9abu:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_u6uu9abu:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_u6uu9abu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0uuu1uuu(<8 x i16> %a) {
+; SSE2-LABEL: shuffle_v8i16_0uuu1uuu:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0uuu1uuu:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_0uuu1uuu:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0uuu1uuu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxwq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0zzz1zzz(<8 x i16> %a) {
+; SSE2-LABEL: shuffle_v8i16_0zzz1zzz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0zzz1zzz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_0zzz1zzz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0zzz1zzz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxwq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0u1u2u3u(<8 x i16> %a) {
+; SSE2-LABEL: shuffle_v8i16_0u1u2u3u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0u1u2u3u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_0u1u2u3u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxwd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0u1u2u3u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxwd %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0z1z2z3z(<8 x i16> %a) {
+; SSE2-LABEL: shuffle_v8i16_0z1z2z3z:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0z1z2z3z:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_0z1z2z3z:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxwd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0z1z2z3z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxwd %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+  ret <8 x i16> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
new file mode 100644
index 0000000..4db0280
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -0,0 +1,1267 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+
+target triple = "x86_64-unknown-unknown"
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,4]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,2,3,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,4,5,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,6,7,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,10,11,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[14,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[1,1,3,3,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,7,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 6, i32 7, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,0,1,u,u,0,1,u,u,0,1,u,u,16,17,u,u,16,17,u,u,16,17,u,u,16,17]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 4, i32 5, i32 6, i32 7, i32 24, i32 24, i32 24, i32 24, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,14,15,12,13,10,11,8,9,u,u,u,u,u,u,u,u,30,31,28,29,26,27,24,25]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 7, i32 6, i32 5, i32 4, i32 27, i32 26, i32 25, i32 24, i32 15, i32 14, i32 13, i32 12>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 3, i32 2, i32 1, i32 0, i32 27, i32 26, i32 25, i32 24, i32 11, i32 10, i32 9, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 10, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,24,25,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,26,27,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 13, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,28,29,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 14, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,30,31,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,18,19,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 9, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,20,21,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 10, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,6,7,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,22,23,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,26,27,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 13, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,28,29,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 14, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,14,15]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,30,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,8,9,8,9,4,5,4,5,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,28,29,28,29,24,25,24,25,20,21,20,21,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 14, i32 14, i32 12, i32 12, i32 10, i32 10, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,28,29,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 14, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[14,15,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,0,1,14,15]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,16,17,30,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,2,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,12,13,8,9,4,5,4,5,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,4,5,8,9,8,9,u,u,12,13,28,29,28,29,u,u,24,25,20,21,20,21,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 2, i32 4, i32 4, i32 undef, i32 6, i32 14, i32 14, i32 undef, i32 12, i32 10, i32 10, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,3,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,u,u,u,u,24,25,24,25,24,25]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 12, i32 12>
+  ret <16 x i16> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
new file mode 100644
index 0000000..79c906b
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -0,0 +1,1562 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+target triple = "x86_64-unknown-unknown"
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    movl $15, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    movl $15, %eax
+; AVX2-NEXT:    vmovd %eax, %xmm1
+; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],zero
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2],zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,2,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,3,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4],zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,4,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,5,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[6],zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,6,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[7],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,7,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,xmm2[8],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,8,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[9],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,9,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[10],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,10,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,11,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,12,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,13,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,xmm2[14],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    movl $128, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    movl $15, %eax
+; AVX2-NEXT:    vmovd %eax, %xmm2
+; AVX2-NEXT:    vpxor %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vinserti128 $0, %xmm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15,23,23,23,23,23,23,23,23,31,31,31,31,31,31,31,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15,19,19,19,19,23,23,23,23,27,27,27,27,31,31,31,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15, i32 19, i32 19, i32 19, i32 19, i32 23, i32 23, i32 23, i32 23, i32 27, i32 27, i32 27, i32 27, i32 31, i32 31, i32 31, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15,17,17,19,19,21,21,23,23,25,25,27,27,29,29,31,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15, i32 17, i32 17, i32 19, i32 19, i32 21, i32 21, i32 23, i32 23, i32 25, i32 25, i32 27, i32 27, i32 29, i32 29, i32 31, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    movl $15, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    movl $15, %eax
+; AVX2-NEXT:    vmovd %eax, %xmm1
+; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 33, i32 2, i32 35, i32 4, i32 37, i32 6, i32 39, i32 8, i32 41, i32 10, i32 43, i32 12, i32 45, i32 14, i32 47, i32 16, i32 49, i32 18, i32 51, i32 20, i32 53, i32 22, i32 55, i32 24, i32 57, i32 26, i32 59, i32 28, i32 61, i32 30, i32 63>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,8,9,10,11,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,0,0,0,0,0,0,0,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,15,14,13,12,11,10,9,8]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,15,14,13,12,11,10,9,8,u,u,u,u,u,u,u,u,31,30,29,28,27,26,25,24]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,7,6,5,4,3,2,1,0]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,18,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 18, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 23, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,30,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 30, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    movl $15, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 31, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,17,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 16, i32 17, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,18,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 16, i32 16, i32 18, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 23, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,30,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    movl $15, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,28,28,28,28,24,24,24,24,20,20,20,20,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 28, i32 28, i32 28, i32 28, i32 24, i32 24, i32 24, i32 24, i32 20, i32 20, i32 20, i32 20, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,u,u,u,u,u,0,0,0,0,0,14,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,u,u,u,u,u,16,16,16,16,16,30,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[14,14,1,1,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,u,0,u,u,u,u,0,0,0,0,0,0,14,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,14,u,u,0,0,0,0,0,0,0,0,0,0,0,0,16,16,u,16,u,u,u,u,16,16,16,16,16,16,30,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 14, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 undef, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,u,u,u,4,u,8,8,8,8,u,u,12,u,28,28,28,28,u,u,u,24,20,20,20,20,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 undef, i32 28, i32 28, i32 28, i32 28, i32 undef, i32 undef, i32 undef, i32 24, i32 20, i32 20, i32 20, i32 20, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,8,8,9,9,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,u,u,u,u,u,u,u,u,16,16,16,u,u,u,u,u,u,u,24,24,24,24,24,24]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u],zero,xmm0[u,u,u,u,u,u,u,7,u,u,u,u]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[4,3,u,3,u,u,u,u,u,u,u],zero,xmm3[u,u,u,u]
+; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1],zero,xmm2[3],zero,zero,zero,zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,4,u,1,6],zero,zero,xmm4[0],zero,xmm4[11,u],zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7]
+; AVX1-NEXT:    vpor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2],zero,xmm5[4,5,6,7,8,9,10],zero,xmm5[12,13,14,15]
+; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[u,u,u,u,1,6,13,u,u],zero,xmm3[u,u]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,3],zero,zero,zero,zero,xmm0[8,9,10],zero,zero,xmm0[13],zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm4[u,u],zero,zero,xmm4[12],zero,xmm4[u,u,u],zero,zero,xmm4[u,0,3]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero
+; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,xmm1[4,5,6,7],zero,zero,zero,xmm1[11,12],zero,xmm1[14,15]
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,u,u,255,255,0,255,u,u,u,255,255,u,0,0,u,u,255,u,255,255,0,0,255,0,255,u,0,0,0,0>
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 42, i32 45, i32 12, i32 13, i32 35, i32 35, i32 60, i32 40, i32 17, i32 22, i32 29, i32 44, i32 33, i32 12, i32 48, i32 51, i32 20, i32 19, i32 52, i32 19, i32 49, i32 54, i32 37, i32 32, i32 48, i32 42, i32 59, i32 7, i32 36, i32 34, i32 36, i32 39>
+  ret <32 x i8> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
new file mode 100644
index 0000000..0bd1bd9
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -0,0 +1,748 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+target triple = "x86_64-unknown-unknown"
+
+define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_0000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0001:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_0001:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0020:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_0020:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0300:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_0300:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_1000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_1000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_2200:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_2200:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_3330:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_3330:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_3210(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_3210:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_3210:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0023(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0023:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0022:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1032:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1133(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1133:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1023(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1023:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1022:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0423:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_0423:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastsd %xmm1, %ymm1
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0462:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 6, i32 2>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0426(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0426:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1537(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1537:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_4062(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_4062:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_5173(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_5173:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 7, i32 3>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_5163(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_5163:
+; ALL:       # BB#0:
+; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[2],ymm0[3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0527(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0527:
+; ALL:       # BB#0:
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_4163:
+; ALL:       # BB#0:
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0145(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0145:
+; ALL:       # BB#0:
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_4501(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_4501:
+; ALL:       # BB#0:
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0167:
+; ALL:       # BB#0:
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x double> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0001:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0001:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0020:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0020:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0112(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0112:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0112:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0300:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0300:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_1000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_1000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_2200:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_2200:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_3330:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_3330:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_3210:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_3210:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0124:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0124:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0142:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0142:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0412:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0412:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
+; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_4012:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_4012:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,2]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0145(<4 x i64> %a, <4 x i64> %b) {
+; ALL-LABEL: shuffle_v4i64_0145:
+; ALL:       # BB#0:
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0451:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0451:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_4501(<4 x i64> %a, <4 x i64> %b) {
+; ALL-LABEL: shuffle_v4i64_4501:
+; ALL:       # BB#0:
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_4015:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_4015:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_2u35:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_2u35:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 undef, i32 3, i32 5>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_1251:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm2[0],ymm0[2],ymm2[3]
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_1251:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,2,2,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 5, i32 1>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: stress_test1:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm0[1,0,3,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
+; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: stress_test1:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm1[3,1,1,0]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,1,3]
+; AVX2-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-NEXT:    retq
+  %c = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
+  %d = shufflevector <4 x i64> %c, <4 x i64> undef, <4 x i32> <i32 3, i32 undef, i32 2, i32 undef>
+  %e = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 undef>
+  %f = shufflevector <4 x i64> %d, <4 x i64> %e, <4 x i32> <i32 5, i32 1, i32 1, i32 0>
+
+  ret <4 x i64> %f
+}
+
+define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
+; AVX1-LABEL: insert_reg_and_zero_v4i64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovq %rdi, %xmm0
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_reg_and_zero_v4i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovq %rdi, %xmm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    retq
+  %v = insertelement <4 x i64> undef, i64 %a, i64 0
+  %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
+; AVX1-LABEL: insert_mem_and_zero_v4i64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovq (%rdi), %xmm0
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_mem_and_zero_v4i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovq (%rdi), %xmm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    retq
+  %a = load i64* %ptr
+  %v = insertelement <4 x i64> undef, i64 %a, i64 0
+  %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
+; ALL-LABEL: insert_reg_and_zero_v4f64:
+; ALL:       # BB#0:
+; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; ALL-NEXT:    retq
+  %v = insertelement <4 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
+; ALL-LABEL: insert_mem_and_zero_v4f64:
+; ALL:       # BB#0:
+; ALL-NEXT:    vmovsd (%rdi), %xmm0
+; ALL-NEXT:    retq
+  %a = load double* %ptr
+  %v = insertelement <4 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @splat_mem_v4f64(double* %ptr) {
+; ALL-LABEL: splat_mem_v4f64:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
+; ALL-NEXT:    retq
+  %a = load double* %ptr
+  %v = insertelement <4 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x i64> @splat_mem_v4i64(i64* %ptr) {
+; AVX1-LABEL: splat_mem_v4i64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovddup (%rdi), %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splat_mem_v4i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT:    retq
+  %a = load i64* %ptr
+  %v = insertelement <4 x i64> undef, i64 %a, i64 0
+  %shuffle = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x double> @splat_mem_v4f64_2(double* %p) {
+; ALL-LABEL: splat_mem_v4f64_2:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
+; ALL-NEXT:    retq
+  %1 = load double* %p
+  %2 = insertelement <2 x double> undef, double %1, i32 0
+  %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %3
+}
+
+define <4 x double> @splat_v4f64(<2 x double> %r) {
+; AVX1-LABEL: splat_v4f64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splat_v4f64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %1
+}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
new file mode 100644
index 0000000..ded8232
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -0,0 +1,1931 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+
+target triple = "x86_64-unknown-unknown"
+
+define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00000000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_00000000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00000010(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00000010:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_00000010:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00000200(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00000200:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_00000200:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00003000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_00003000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00040000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_00040000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00500000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00500000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[u,u,1,u,4,4,4,4]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_00500000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_06000000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[u,2,u,u,4,4,4,4]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,0,4,5,4,4]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_06000000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_70000000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,u,u,u,4,4,4,4]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_70000000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    movl $7, %eax
+; AVX2-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $0, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_01014545:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00112233:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_00112233:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00001111:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_00001111:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_81a3c5e7(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_81a3c5e7:
+; ALL:       # BB#0:
+; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_08080808:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_08080808:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastss %xmm1, %ymm1
+; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_08084c4c:
+; ALL:       # BB#0:
+; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_8823cc67:
+; ALL:       # BB#0:
+; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_9832dc76:
+; ALL:       # BB#0:
+; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_9810dc54(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_9810dc54:
+; ALL:       # BB#0:
+; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08194c5d(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_08194c5d:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_2a3b6e7f(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_2a3b6e7f:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_08192a3b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_08192a3b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,0,u,1,u,2,u,3>
+; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_08991abb:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_08991abb:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
+; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_091b2d3f:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_091b2d3f:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_09ab1def:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_09ab1def:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00014445(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00014445:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00204464(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00204464:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_03004744(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_03004744:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10005444(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10005444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_22006644(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_22006644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_33307774(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_33307774:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_32107654(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_32107654:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00234467(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00234467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00224466(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00224466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10325476(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10325476:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_11335577(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_11335577:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10235467(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10235467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10225466(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10225466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00015444(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00015444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00204644(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00204644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_03004474(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_03004474:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10004444(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10004444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_22006446(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_22006446:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_33307474(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_33307474:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_32104567(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_32104567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00236744(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00236744:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00226644(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00226644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10324567(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10324567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_11334567(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_11334567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_01235467(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_01235467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_01235466(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_01235466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_002u6u44(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_002u6u44:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00uu66uu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00uu66uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_103245uu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_103245uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_1133uu67(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_1133uu67:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_0uu354uu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_0uu354uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_uuu3uu66(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_uuu3uu66:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_c348cda0:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[0,0],ymm0[4,7],ymm2[4,4]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_c348cda0:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,3,4,u,u,u,u,0>
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
+; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_f511235a:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,1,1,4,5,5,5]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_f511235a:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2>
+; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,5,1,1,2,3,5,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_32103210(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_32103210:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_32103210:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_76547654:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_76547654:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_76543210:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_76543210:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_3210ba98:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_3210ba98:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,u,u,3,2,1,0>
+; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_3210fedc:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_3210fedc:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_7654fedc(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_7654fedc:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_7654fedc:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_fedc7654:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_fedc7654:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
+; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_ba987654:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_ba987654:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_ba983210:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_ba983210:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00000000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00000000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00000010:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00000010:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00000200:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00000200:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00003000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00003000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00040000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00040000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00500000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[u,u,1,u,4,4,4,4]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00500000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_06000000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[u,2,u,u,4,4,4,4]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,0,4,5,4,4]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_06000000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_70000000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,u,u,u,4,4,4,4]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_70000000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    movl $7, %eax
+; AVX2-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $0, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_01014545:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_01014545:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00112233:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00112233:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00001111:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00001111:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_81a3c5e7(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_81a3c5e7:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_81a3c5e7:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08080808:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_08080808:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08084c4c:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_08084c4c:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_8823cc67(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_8823cc67:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_8823cc67:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_9832dc76(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_9832dc76:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_9832dc76:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,3,5,4,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_9810dc54(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_9810dc54:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_9810dc54:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,0,4,5,5,4]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,3,5,4,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_08194c5d(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08194c5d:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_08194c5d:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_2a3b6e7f(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_2a3b6e7f:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_2a3b6e7f:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08192a3b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_08192a3b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,0,u,1,u,2,u,3>
+; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
+; AVX2-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08991abb:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_08991abb:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
+; AVX2-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
+; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_091b2d3f:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_091b2d3f:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
+; AVX2-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_09ab1def:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_09ab1def:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
+; AVX2-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00014445(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00014445:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00014445:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00204464(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00204464:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00204464:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_03004744(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_03004744:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_03004744:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10005444(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10005444:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_10005444:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_22006644(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_22006644:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_22006644:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_33307774(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_33307774:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_33307774:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_32107654(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_32107654:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_32107654:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00234467(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00234467:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00234467:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00224466(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00224466:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00224466:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10325476(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10325476:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_10325476:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_11335577(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_11335577:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_11335577:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10235467(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10235467:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_10235467:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10225466(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10225466:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_10225466:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00015444:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00015444:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00204644:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00204644:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_03004474:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_03004474:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10004444:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_10004444:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_22006446:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_22006446:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_33307474:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_33307474:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_32104567:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_32104567:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00236744:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00236744:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00226644:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00226644:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10324567:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_10324567:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_11334567:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_11334567:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_01235467:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_01235467:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_01235466:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_01235466:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_002u6u44:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_002u6u44:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4>
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00uu66uu:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00uu66uu:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u>
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_103245uu:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_103245uu:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u>
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_1133uu67:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_1133uu67:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7>
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_0uu354uu:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_0uu354uu:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u>
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_uuu3uu66:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_uuu3uu66:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6>
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_6caa87e5:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,2],ymm2[4,4],ymm1[6,6]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_6caa87e5:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,4,2,2,0,u,6,u>
+; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,1,3,2]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_32103210(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_32103210:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_32103210:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_76547654:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_76547654:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_76543210:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_76543210:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_3210ba98:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_3210ba98:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,3,2,1,0>
+; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_3210fedc:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_3210fedc:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_7654fedc:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_7654fedc:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
+; AVX2-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_fedc7654:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_fedc7654:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
+; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_ba987654:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_ba987654:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_ba983210:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_ba983210:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x float> @splat_mem_v8f32_2(float* %p) {
+; ALL-LABEL: splat_mem_v8f32_2:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastss (%rdi), %ymm0
+; ALL-NEXT:    retq
+  %1 = load float* %p
+  %2 = insertelement <4 x float> undef, float %1, i32 0
+  %3 = shufflevector <4 x float> %2, <4 x float> undef, <8 x i32> zeroinitializer
+  ret <8 x float> %3
+}
+
+define <8 x float> @splat_v8f32(<4 x float> %r) {
+; AVX1-LABEL: splat_v8f32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splat_v8f32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer
+  ret <8 x float> %1
+}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
new file mode 100644
index 0000000..8f87c7c
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -0,0 +1,1429 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+
+target triple = "x86_64-unknown-unknown"
+
+define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00000000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd %xmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00000010:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00000200:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00003000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00040000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00500000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_06000000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_70000000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_01014545:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00112233:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00001111:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_81a3c5e7:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08080808:
+; ALL:       # BB#0:
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08084c4c:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vbroadcastsd %xmm3, %ymm3
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3]
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_8823cc67:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vbroadcastsd %xmm3, %ymm3
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; ALL-NEXT:    vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_9832dc76:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[1,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_9810dc54:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[1,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,0]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08194c5d:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_2a3b6e7f:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08192a3b:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm2 = ymm1[0,2,2,3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08991abb:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm0[1,0,2,2]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm3 = ymm1[0,2,3,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_091b2d3f:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_09ab1def:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm0[1,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00014445:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00204464:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_03004744:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10005444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_22006644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_33307774:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_32107654:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00234467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00224466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10325476:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_11335577:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10235467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10225466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00015444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00204644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_03004474:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,3,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10004444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_22006446:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_33307474:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,3,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_32104567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00236744:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00226644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10324567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_11334567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_01235467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_01235466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_002u6u44:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00uu66uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_103245uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_1133uu67:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_0uu354uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_uuu3uu66:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_c348cda0:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[0,1],ymm2[0,1]
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vbroadcastsd %xmm1, %ymm4
+; ALL-NEXT:    vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3]
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_f511235a:
+; ALL:       # BB#0:
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT:    vpermpd {{.*#+}} ymm4 = ymm3[0,1,1,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm4 = ymm1[0,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm1
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>
+  ret <8 x double> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00000000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpbroadcastq %xmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00000010:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00000200:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00003000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00040000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00500000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_06000000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_70000000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_01014545:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
+; ALL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00112233:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1]
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00001111:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_81a3c5e7:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_08080808:
+; ALL:       # BB#0:
+; ALL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_08084c4c:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vinserti128 $1, %xmm2, %ymm2, %ymm2
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vpbroadcastq %xmm3, %ymm3
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
+; ALL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_8823cc67:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vpbroadcastq %xmm3, %ymm3
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; ALL-NEXT:    vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_9832dc76:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_9810dc54:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,0]
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_08194c5d:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; ALL-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_2a3b6e7f:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; ALL-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_08192a3b:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[0,2,2,3]
+; ALL-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_08991abb:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[0,2,3,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_091b2d3f(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_091b2d3f:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_09ab1def:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00014445:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00204464:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_03004744:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10005444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_22006644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_33307774:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_32107654:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00234467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00224466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10325476:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_11335577:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10235467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[1,0,2,3]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10225466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[1,0,2,2]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00015444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00204644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_03004474:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,3,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10004444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_22006446:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,0,0,2]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_33307474:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,0,3,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_32104567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00236744:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00226644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10324567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_11334567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_01235467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,0,2,3]
+; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_01235466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_002u6u44:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00uu66uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_103245uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_1133uu67:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_0uu354uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_uuu3uu66:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
+; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_6caa87e5:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[0,1,0,1]
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
+; ALL-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
+; ALL-NEXT:    vpbroadcastq %xmm3, %ymm3
+; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
+  ret <8 x i64> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index e60ecb7..22a6749 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1,6 +1,14 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+;
+; Verify that the DAG combiner correctly folds bitwise operations across
+; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
+; basic and always-safe patterns. Also test that the DAG combiner will combine
+; target-specific shuffle instructions where reasonable.
 
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
 declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
@@ -8,57 +16,72 @@ declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
 declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
 
 define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd1
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    retq
-  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 
-  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27) 
+; ALL-LABEL: combine_pshufd1:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    retq
+entry:
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
+  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
   ret <4 x i32> %c
 }
 
 define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd2
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    retq
-  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 
+; ALL-LABEL: combine_pshufd2:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    retq
+entry:
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
   %b.cast = bitcast <4 x i32> %b to <8 x i16>
   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
   %c.cast = bitcast <8 x i16> %c to <4 x i32>
-  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
   ret <4 x i32> %d
 }
 
 define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd3
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    retq
-  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 
+; ALL-LABEL: combine_pshufd3:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    retq
+entry:
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
   %b.cast = bitcast <4 x i32> %b to <8 x i16>
   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
   %c.cast = bitcast <8 x i16> %c to <4 x i32>
-  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
   ret <4 x i32> %d
 }
 
 define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd4
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    retq
-  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31) 
+; SSE-LABEL: combine_pshufd4:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_pshufd4:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; AVX-NEXT:    retq
+entry:
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
   %b.cast = bitcast <4 x i32> %b to <8 x i16>
   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
   %c.cast = bitcast <8 x i16> %c to <4 x i32>
-  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31) 
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
   ret <4 x i32> %d
 }
 
 define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd5
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    retq
-  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76) 
+; SSE-LABEL: combine_pshufd5:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_pshufd5:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; AVX-NEXT:    retq
+entry:
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
   %b.cast = bitcast <4 x i32> %b to <8 x i16>
   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
   %c.cast = bitcast <8 x i16> %c to <4 x i32>
@@ -67,53 +90,2458 @@ define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
 }
 
 define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd6
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd $0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: combine_pshufd6:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_pshufd6:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT:    retq
+entry:
   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
   %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
   ret <4 x i32> %c
 }
 
 define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
-; CHECK-SSE2-LABEL: @combine_pshuflw1
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    retq
-  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 
-  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) 
+; ALL-LABEL: combine_pshuflw1:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    retq
+entry:
+  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
+  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
   ret <8 x i16> %c
 }
 
 define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
-; CHECK-SSE2-LABEL: @combine_pshuflw2
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    retq
+; ALL-LABEL: combine_pshuflw2:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    retq
+entry:
   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
-  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28) 
-  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) 
+  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
+  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
   ret <8 x i16> %d
 }
 
 define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
-; CHECK-SSE2-LABEL: @combine_pshuflw3
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: combine_pshuflw3:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_pshuflw3:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; AVX-NEXT:    retq
+entry:
   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
-  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27) 
-  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) 
+  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
+  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
   ret <8 x i16> %d
 }
 
 define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufhw1
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: combine_pshufhw1:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_pshufhw1:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; AVX-NEXT:    retq
+entry:
   %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
-  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) 
-  %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27) 
+  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
+  %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
   ret <8 x i16> %d
 }
 
+define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test1:
+; SSE:       # BB#0:
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test2:
+; SSE:       # BB#0:
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test3:
+; SSE:       # BB#0:
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+
+define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test4:
+; SSE:       # BB#0:
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test5:
+; SSE:       # BB#0:
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test5:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test6:
+; SSE:       # BB#0:
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test6:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+
+
+; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
+; are not performing a swizzle operations.
+
+define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test1b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test1b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    andps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test1b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test1b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test1b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test2b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test2b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test2b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test2b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test2b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test3b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test3b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm0
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test3b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test3b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test3b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+
+define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test4b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test4b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    andps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test4b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test4b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test4b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test5b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test5b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test5b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test5b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test5b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test6b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test6b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm0
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test6b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test6b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test6b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX2-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+
+define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test1c:
+; SSE:       # BB#0:
+; SSE-NEXT:    andps %xmm1, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test1c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test2c:
+; SSE:       # BB#0:
+; SSE-NEXT:    orps %xmm1, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test2c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test3c:
+; SSE:       # BB#0:
+; SSE-NEXT:    xorps %xmm1, %xmm0
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test3c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+
+define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test4c:
+; SSE:       # BB#0:
+; SSE-NEXT:    andps %xmm1, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test4c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test5c:
+; SSE:       # BB#0:
+; SSE-NEXT:    orps %xmm1, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test5c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test6c:
+; SSE:       # BB#0:
+; SSE-NEXT:    xorps %xmm1, %xmm0
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test6c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+
+define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test1:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test2:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test3:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test4:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_nested_undef_test4:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_nested_undef_test4:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test5:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test5:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test6:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test6:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test7:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test7:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test8:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test8:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test9:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test9:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test10:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test10:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test11:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test11:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test12:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_nested_undef_test12:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_nested_undef_test12:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
+  ret <4 x i32> %2
+}
+
+; The following pair of shuffles is folded into vector %A.
+define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
+; ALL-LABEL: combine_nested_undef_test13:
+; ALL:       # BB#0:
+; ALL-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
+  ret <4 x i32> %2
+}
+
+; The following pair of shuffles is folded into vector %B.
+define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test14:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test14:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovaps %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
+  ret <4 x i32> %2
+}
+
+
+; Verify that we don't optimize the following cases. We expect more than one shuffle.
+;
+; FIXME: Many of these already don't make sense, and the rest should stop
+; making sense with th enew vector shuffle lowering. Revisit at least testing for
+; it.
+
+define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test15:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,1,0,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test15:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[3,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
+; SSE2-LABEL: combine_nested_undef_test16:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_nested_undef_test16:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_nested_undef_test16:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_nested_undef_test16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_nested_undef_test16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX2-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test17:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[3,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,1,0,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test17:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[3,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test18:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test18:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test19:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,0,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test19:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,0,0,0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test20:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[0,0]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test20:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[0,0]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test21:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[3,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test21:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[3,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+
+; Test that we correctly combine shuffles according to rule
+;  shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
+
+define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test22:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test22:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test23:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test23:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test24:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test24:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test25:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_nested_undef_test25:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_nested_undef_test25:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test26:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test26:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test27:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_nested_undef_test27:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_nested_undef_test27:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test28:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test28:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
+  ret <4 x i32> %2
+}
+
+define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test1:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test1:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test1:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovaps %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test2:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test2:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test2:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_test3:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_test4:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test5:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps %xmm1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test5:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movaps %xmm1, %xmm2
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test5:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test5:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: combine_test6:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test6:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test6:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test6:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovaps %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: combine_test7:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test7:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test7:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_test7:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test7:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test8:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test8:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test9:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test9:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: combine_test10:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps %xmm1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test10:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movaps %xmm1, %xmm2
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test10:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_test10:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test10:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x i32> %2
+}
+
+define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
+; ALL-LABEL: combine_test11:
+; ALL:       # BB#0:
+; ALL-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test12:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test12:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test12:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test12:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_test13:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test13:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_test14:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test14:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test15:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test15:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movaps %xmm0, %xmm2
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test15:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test15:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  ret <4 x float> %2
+}
+
+define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
+; ALL-LABEL: combine_test16:
+; ALL:       # BB#0:
+; ALL-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: combine_test17:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test17:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test17:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_test17:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test17:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test18:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test18:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test19:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test19:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: combine_test20:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test20:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movaps %xmm0, %xmm2
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test20:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_test20:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test20:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  ret <4 x i32> %2
+}
+
+
+; Check some negative cases.
+; FIXME: Do any of these really make sense? Are they redundant with the above tests?
+
+define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test1b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test1b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test1b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test1b:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test2b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test2b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test2b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test2b:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0,0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_test3b:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps %xmm1, %xmm2
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test3b:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[2,0],xmm0[3,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test4b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test4b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test4b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test4b:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
+  ret <4 x float> %2
+}
+
+
+; Verify that we correctly fold shuffles even when we use illegal vector types.
+
+define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: combine_test1c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd (%rdi), %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movd (%rsi), %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    movss %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test1c:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd (%rdi), %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    movd (%rsi), %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    movss %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test1c:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd (%rdi), %xmm1
+; SSE41-NEXT:    pmovzxbd (%rsi), %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_test1c:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpmovzxbd (%rdi), %xmm0
+; AVX1-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test1c:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpmovzxbd (%rdi), %xmm0
+; AVX2-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT:    retq
+  %A = load <4 x i8>* %a
+  %B = load <4 x i8>* %b
+  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  ret <4 x i8> %2
+}
+
+define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: combine_test2c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd (%rdi), %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    movd (%rsi), %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test2c:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd (%rdi), %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    movd (%rsi), %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test2c:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd (%rdi), %xmm0
+; SSE41-NEXT:    pmovzxbd (%rsi), %xmm1
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test2c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxbd (%rdi), %xmm0
+; AVX-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %A = load <4 x i8>* %a
+  %B = load <4 x i8>* %b
+  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
+  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
+  ret <4 x i8> %2
+}
+
+define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: combine_test3c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd (%rdi), %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movd (%rsi), %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test3c:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd (%rdi), %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    movd (%rsi), %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test3c:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd (%rdi), %xmm1
+; SSE41-NEXT:    pmovzxbd (%rsi), %xmm0
+; SSE41-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test3c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxbd (%rdi), %xmm0
+; AVX-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %A = load <4 x i8>* %a
+  %B = load <4 x i8>* %b
+  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x i8> %2
+}
+
+define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: combine_test4c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd (%rdi), %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movd (%rsi), %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test4c:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd (%rdi), %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    movd (%rsi), %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test4c:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd (%rdi), %xmm1
+; SSE41-NEXT:    pmovzxbd (%rsi), %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_test4c:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpmovzxbd (%rdi), %xmm0
+; AVX1-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test4c:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpmovzxbd (%rdi), %xmm0
+; AVX2-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-NEXT:    retq
+  %A = load <4 x i8>* %a
+  %B = load <4 x i8>* %b
+  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x i8> %2
+}
+
+
+; The following test cases are generated from this C++ code
+;
+;__m128 blend_01(__m128 a, __m128 b)
+;{
+;  __m128 s = a;
+;  s = _mm_blend_ps( s, b, 1<<0 );
+;  s = _mm_blend_ps( s, b, 1<<1 );
+;  return s;
+;}
+;
+;__m128 blend_02(__m128 a, __m128 b)
+;{
+;  __m128 s = a;
+;  s = _mm_blend_ps( s, b, 1<<0 );
+;  s = _mm_blend_ps( s, b, 1<<2 );
+;  return s;
+;}
+;
+;__m128 blend_123(__m128 a, __m128 b)
+;{
+;  __m128 s = a;
+;  s = _mm_blend_ps( s, b, 1<<1 );
+;  s = _mm_blend_ps( s, b, 1<<2 );
+;  s = _mm_blend_ps( s, b, 1<<3 );
+;  return s;
+;}
+
+; Ideally, we should collapse the following shuffles into a single one.
+
+define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_blend_01:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_blend_01:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_blend_01:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_blend_01:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
+  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  ret <4 x float> %shuffle6
+}
+
+define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_blend_02:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movss %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_blend_02:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movss %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_blend_02:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_blend_02:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
+  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  ret <4 x float> %shuffle6
+}
+
+define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_blend_123:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_blend_123:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_blend_123:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_blend_123:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
+  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %shuffle12
+}
+
+define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test_movhl_1:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test_movhl_1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test_movhl_2:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test_movhl_2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test_movhl_3:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test_movhl_3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
+  ret <4 x i32> %2
+}
+
+
+; Verify that we fold shuffles according to rule:
+;  (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
+
+define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_undef_input_test1:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_undef_input_test1:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_undef_input_test1:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test2:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test3:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test4:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_undef_input_test5:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_undef_input_test5:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_undef_input_test5:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test5:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+
+; Verify that we fold shuffles according to rule:
+;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
+
+define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
+; ALL-LABEL: combine_undef_input_test6:
+; ALL:       # BB#0:
+; ALL-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
+; SSE2-LABEL: combine_undef_input_test7:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_undef_input_test7:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_undef_input_test7:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test7:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
+; SSE2-LABEL: combine_undef_input_test8:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_undef_input_test8:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_undef_input_test8:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test8:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
+; SSE-LABEL: combine_undef_input_test9:
+; SSE:       # BB#0:
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test9:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
+; ALL-LABEL: combine_undef_input_test10:
+; ALL:       # BB#0:
+; ALL-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_undef_input_test11:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_undef_input_test11:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_undef_input_test11:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test11:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
+  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test12:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test12:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test13:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test13:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test14:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test14:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_undef_input_test15:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_undef_input_test15:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_undef_input_test15:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test15:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
+  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
+  ret <4 x float> %2
+}
+
+
+; Verify that shuffles are canonicalized according to rules:
+;  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
+;
+; This allows to trigger the following combine rule:
+;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
+;
+; As a result, all the shuffle pairs in each function below should be
+; combined into a single legal shuffle operation.
+
+define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
+; ALL-LABEL: combine_undef_input_test16:
+; ALL:       # BB#0:
+; ALL-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
+  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
+; SSE2-LABEL: combine_undef_input_test17:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_undef_input_test17:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_undef_input_test17:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test17:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
+; SSE2-LABEL: combine_undef_input_test18:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_undef_input_test18:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_undef_input_test18:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test18:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
+; SSE-LABEL: combine_undef_input_test19:
+; SSE:       # BB#0:
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test19:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
+; ALL-LABEL: combine_undef_input_test20:
+; ALL:       # BB#0:
+; ALL-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
+  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
+  ret <4 x float> %2
+}
+
+; These tests are designed to test the ability to combine away unnecessary
+; operations feeding into a shuffle. The AVX cases are the important ones as
+; they leverage operations which cannot be done naturally on the entire vector
+; and thus are decomposed into multiple smaller operations.
+
+define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
+; SSE-LABEL: combine_unneeded_subvector1:
+; SSE:       # BB#0:
+; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_unneeded_subvector1:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_unneeded_subvector1:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i32> %c
+}
+
+define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: combine_unneeded_subvector2:
+; SSE:       # BB#0:
+; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_unneeded_subvector2:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_unneeded_subvector2:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
+; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    retq
+  %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
+  ret <8 x i32> %d
+}
+
+define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
+; SSE41-LABEL: combine_insertps1:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_insertps1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
+; AVX-NEXT:    retq
+
+  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
+  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
+  ret <4 x float> %d
+}
+
+define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
+; SSE41-LABEL: combine_insertps2:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_insertps2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
+; AVX-NEXT:    retq
+
+  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
+  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
+  ret <4 x float> %d
+}
+
+define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
+; SSE41-LABEL: combine_insertps3:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_insertps3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-NEXT:    retq
+
+  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
+  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
+  ret <4 x float> %d
+}
+
+define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
+; SSE41-LABEL: combine_insertps4:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_insertps4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT:    retq
+
+  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
+  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
+  ret <4 x float> %d
+}
diff --git a/test/CodeGen/X86/vector-shuffle-sse1.ll b/test/CodeGen/X86/vector-shuffle-sse1.ll
new file mode 100644
index 0000000..226deb0
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-sse1.ll
@@ -0,0 +1,235 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=-sse2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=SSE1
+
+target triple = "x86_64-unknown-unknown"
+
+define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0001:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0020:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0300:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_1000:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_2200:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_3330:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_3210:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0011:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_2233:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0022:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_1133:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_4zzz:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    xorps %xmm1, %xmm1
+; SSE1-NEXT:    movss %xmm0, %xmm1
+; SSE1-NEXT:    movaps %xmm1, %xmm0
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_z4zz:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    xorps %xmm1, %xmm1
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_zz4z:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    xorps %xmm1, %xmm1
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
+; SSE1-NEXT:    movaps %xmm1, %xmm0
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_zuu4:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    xorps %xmm1, %xmm1
+; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE1-NEXT:    movaps %xmm1, %xmm0
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_zzz7:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    xorps %xmm1, %xmm1
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE1-NEXT:    movaps %xmm1, %xmm0
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_z6zz:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    xorps %xmm1, %xmm1
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
+; SSE1-LABEL: insert_reg_and_zero_v4f32:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    xorps %xmm1, %xmm1
+; SSE1-NEXT:    movss %xmm0, %xmm1
+; SSE1-NEXT:    movaps %xmm1, %xmm0
+; SSE1-NEXT:    retq
+  %v = insertelement <4 x float> undef, float %a, i32 0
+  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
+; SSE1-LABEL: insert_mem_and_zero_v4f32:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    movss (%rdi), %xmm0
+; SSE1-NEXT:    retq
+  %a = load float* %ptr
+  %v = insertelement <4 x float> undef, float %a, i32 0
+  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
+; SSE1-LABEL: insert_mem_lo_v4f32:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    movq (%rdi), %rax
+; SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%rsp)
+; SSE1-NEXT:    shrq $32, %rax
+; SSE1-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; SSE1-NEXT:    movss {{[-0-9]+}}(%rsp), %xmm1
+; SSE1-NEXT:    movss {{[-0-9]+}}(%rsp), %xmm2
+; SSE1-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE1-NEXT:    xorps %xmm2, %xmm2
+; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,1]
+; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; SSE1-NEXT:    movaps %xmm1, %xmm0
+; SSE1-NEXT:    retq
+  %a = load <2 x float>* %ptr
+  %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
+; SSE1-LABEL: insert_mem_hi_v4f32:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    movq (%rdi), %rax
+; SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%rsp)
+; SSE1-NEXT:    shrq $32, %rax
+; SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%rsp)
+; SSE1-NEXT:    movss {{[-0-9]+}}(%rsp), %xmm1
+; SSE1-NEXT:    movss {{[-0-9]+}}(%rsp), %xmm2
+; SSE1-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE1-NEXT:    xorps %xmm2, %xmm2
+; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,1]
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; SSE1-NEXT:    retq
+  %a = load <2 x float>* %ptr
+  %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
+; SSE1-LABEL: shuffle_mem_v4f32_3210:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    movaps (%rdi), %xmm0
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE1-NEXT:    retq
+  %a = load <4 x float>* %ptr
+  %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x float> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
new file mode 100644
index 0000000..afd7a24
--- /dev/null
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -0,0 +1,206 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_8i16_to_8i32:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_8i16_to_8i32:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    pand %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_8i16_to_8i32:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovzxwd %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_8i16_to_8i32:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_8i16_to_8i32:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovzxwd %xmm0, %ymm0
+; AVX2-NEXT:    retq
+entry:
+  %B = zext <8 x i16> %A to <8 x i32>
+  ret <8 x i32>%B
+}
+
+define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_4i32_to_4i64:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_4i32_to_4i64:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_4i32_to_4i64:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovzxdq %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
+; SSE41-NEXT:    pand %xmm3, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_4i32_to_4i64:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpmovzxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_4i32_to_4i64:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovzxdq %xmm0, %ymm0
+; AVX2-NEXT:    retq
+entry:
+  %B = zext <4 x i32> %A to <4 x i64>
+  ret <4 x i64>%B
+}
+
+define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
+; SSE2-LABEL: zext_8i8_to_8i32:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_8i8_to_8i32:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
+; SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    pand %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_8i8_to_8i32:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovzxwd %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_8i8_to_8i32:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpmovzxwd %xmm0, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_8i8_to_8i32:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovzxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+entry:
+  %t = zext <8 x i8> %z to <8 x i32>
+  ret <8 x i32> %t
+}
+
+; PR17654
+define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) {
+; SSE2-LABEL: zext_16i8_to_16i16:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_16i8_to_16i16:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT:    pand %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_16i8_to_16i16:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovzxbw %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_16i8_to_16i16:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmovzxbw %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_16i8_to_16i16:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovzxbw %xmm0, %ymm0
+; AVX2-NEXT:    retq
+entry:
+  %t = zext <16 x i8> %z to <16 x i16>
+  ret <16 x i16> %t
+}
diff --git a/test/CodeGen/X86/vectorcall.ll b/test/CodeGen/X86/vectorcall.ll
new file mode 100644
index 0000000..1e52654
--- /dev/null
+++ b/test/CodeGen/X86/vectorcall.ll
@@ -0,0 +1,93 @@
+; RUN: llc -mtriple=i686-pc-win32 -mattr=+sse2 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X86
+; RUN: llc -mtriple=x86_64-pc-win32 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+
+; Test integer arguments.
+
+define x86_vectorcallcc i32 @test_int_1() {
+  ret i32 0
+}
+
+; CHECK-LABEL: {{^}}test_int_1@@0:
+; CHECK: xorl %eax, %eax
+
+define x86_vectorcallcc i32 @test_int_2(i32 inreg %a) {
+  ret i32 %a
+}
+
+; X86-LABEL: {{^}}test_int_2@@4:
+; X64-LABEL: {{^}}test_int_2@@8:
+; CHECK: movl %ecx, %eax
+
+define x86_vectorcallcc i32 @test_int_3(i64 inreg %a) {
+  %at = trunc i64 %a to i32
+  ret i32 %at
+}
+
+; X86-LABEL: {{^}}test_int_3@@8:
+; X64-LABEL: {{^}}test_int_3@@8:
+; CHECK: movl %ecx, %eax
+
+define x86_vectorcallcc i32 @test_int_4(i32 inreg %a, i32 inreg %b) {
+  %s = add i32 %a, %b
+  ret i32 %s
+}
+
+; X86-LABEL: {{^}}test_int_4@@8:
+; X86: leal (%ecx,%edx), %eax
+
+; X64-LABEL: {{^}}test_int_4@@16:
+; X64: leal (%rcx,%rdx), %eax
+
+define x86_vectorcallcc i32 @"\01test_int_5"(i32, i32) {
+  ret i32 0
+}
+; CHECK-LABEL: {{^}}test_int_5:
+
+define x86_vectorcallcc double @test_fp_1(double %a, double %b) {
+  ret double %b
+}
+; CHECK-LABEL: {{^}}test_fp_1@@16:
+; CHECK: movaps %xmm1, %xmm0
+
+define x86_vectorcallcc double @test_fp_2(
+    double, double, double, double, double, double, double %r) {
+  ret double %r
+}
+; CHECK-LABEL: {{^}}test_fp_2@@56:
+; CHECK: movsd {{[0-9]+\(%[re]sp\)}}, %xmm0
+
+define x86_vectorcallcc {double, double, double, double} @test_fp_3() {
+  ret {double, double, double, double}
+        { double 0.0, double 0.0, double 0.0, double 0.0 }
+}
+; CHECK-LABEL: {{^}}test_fp_3@@0:
+; CHECK: xorps %xmm0
+; CHECK: xorps %xmm1
+; CHECK: xorps %xmm2
+; CHECK: xorps %xmm3
+
+; FIXME: Returning via x87 isn't compatible, but its hard to structure the
+; tablegen any other way.
+define x86_vectorcallcc {double, double, double, double, double} @test_fp_4() {
+  ret {double, double, double, double, double}
+        { double 0.0, double 0.0, double 0.0, double 0.0, double 0.0 }
+}
+; CHECK-LABEL: {{^}}test_fp_4@@0:
+; CHECK: fldz
+; CHECK: xorps %xmm0
+; CHECK: xorps %xmm1
+; CHECK: xorps %xmm2
+; CHECK: xorps %xmm3
+
+define x86_vectorcallcc <16 x i8> @test_vec_1(<16 x i8> %a, <16 x i8> %b) {
+  ret <16 x i8> %b
+}
+; CHECK-LABEL: {{^}}test_vec_1@@32:
+; CHECK: movaps %xmm1, %xmm0
+
+define x86_vectorcallcc <16 x i8> @test_vec_2(
+    double, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> %r) {
+  ret <16 x i8> %r
+}
+; CHECK-LABEL: {{^}}test_vec_2@@104:
+; CHECK: movaps (%{{[re]}}cx), %xmm0
diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll
new file mode 100644
index 0000000..0c0f4bb
--- /dev/null
+++ b/test/CodeGen/X86/vselect-avx.ll
@@ -0,0 +1,85 @@
+; RUN: llc %s -o - -mattr=+avx | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+; For this test we used to optimize the <i1 true, i1 false, i1 false, i1 true>
+; mask into <i32 2147483648, i32 0, i32 0, i32 2147483648> because we thought
+; we would lower that into a blend where only the high bit is relevant.
+; However, since the whole mask is constant, this is simplified incorrectly
+; by the generic code, because it was expecting -1 in place of 2147483648.
+; 
+; The problem does not occur without AVX, because vselect of v4i32 is not legal
+; nor custom.
+;
+; <rdar://problem/18675020>
+
+; CHECK-LABEL: test:
+; CHECK: vmovdqa {{.*#+}} xmm0 = [65535,0,0,65535]
+; CHECK: vmovdqa {{.*#+}} xmm2 = [65533,124,125,14807]
+; CHECK: ret
+define void @test(<4 x i16>* %a, <4 x i16>* %b) {
+body:
+  %predphi = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -3, i16 545, i16 4385, i16 14807>, <4 x i16> <i16 123, i16 124, i16 125, i16 127>
+  %predphi42 = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer
+  store <4 x i16> %predphi, <4 x i16>* %a, align 8
+  store <4 x i16> %predphi42, <4 x i16>* %b, align 8
+  ret void
+}
+
+; Improve code coverage.
+;
+; When shrinking the condition used into the select to match a blend, this
+; test case exercises the path where the modified node is not the root
+; of the condition.
+;
+; CHECK-LABEL: test2:
+; CHECK:	vpslld	$31, %xmm0, %xmm0
+; CHECK-NEXT:	vpmovsxdq	%xmm0, %xmm1
+; CHECK-NEXT:	vpshufd	$78, %xmm0, %xmm0       ## xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:	vpmovsxdq	%xmm0, %xmm0
+; CHECK-NEXT:	vinsertf128	$1, %xmm0, %ymm1, [[MASK:%ymm[0-9]+]]
+; CHECK: vblendvpd	[[MASK]]
+; CHECK: retq
+define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
+bb:
+  %arrayidx1928 = getelementptr inbounds double** %call1559, i64 %indvars.iv4198
+  %tmp1888 = load double** %arrayidx1928, align 8
+  %predphi.v.v = select <4 x i1> %tmp1895, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+  %tmp1900 = bitcast double* %tmp1888 to <4 x double>*
+  store <4 x double> %predphi.v.v, <4 x double>* %tmp1900, align 8
+  ret void
+}
+
+; For this test, we used to optimized the conditional mask for the blend, i.e.,
+; we shrunk some of its bits.
+; However, this same mask was used in another select (%predphi31) that turned out
+; to be optimized into a and. In that case, the conditional mask was wrong.
+;
+; Make sure that the and is fed by the original mask.
+; 
+; <rdar://problem/18819506>
+
+; Note: For now, hard code ORIG_MASK and SHRUNK_MASK registers, because we
+; cannot express that ORIG_MASK must not be equal to ORIG_MASK. Otherwise,
+; even a faulty pattern would pass!
+;  
+; CHECK-LABEL: test3:
+; Compute the original mask.
+;	CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[ORIG_MASK:%xmm0]]
+; Shrink the bit of the mask.
+; CHECK-NEXT: vpslld	$31, [[ORIG_MASK]], [[SHRUNK_MASK:%xmm3]]
+; Use the shrunk mask in the blend.
+; CHECK-NEXT:	vblendvps	[[SHRUNK_MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+; Use the original mask in the and.
+; CHECK-NEXT: vpand LCPI2_2(%rip), [[ORIG_MASK]], {{%xmm[0-9]+}} 
+; CHECK: retq
+define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17,  <4 x i16> %tmp3, <4 x i16> %tmp12) {
+  %tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3>
+  %tmp7 = icmp eq <4 x i32> %tmp6, zeroinitializer
+  %predphi = select <4 x i1> %tmp7, <4 x i16> %tmp3, <4 x i16> %tmp12
+  %predphi31 = select <4 x i1> %tmp7, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer
+
+  store <4 x i16> %predphi31, <4 x i16>* %tmp16, align 8
+  store <4 x i16> %predphi, <4 x i16>* %tmp17, align 8
+ ret void
+}
diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll
index 42cf06a..3bd1dc4 100644
--- a/test/CodeGen/X86/vselect.ll
+++ b/test/CodeGen/X86/vselect.ll
@@ -3,270 +3,253 @@
 ; Verify that we don't emit packed vector shifts instructions if the
 ; condition used by the vector select is a vector of constants.
 
-
 define <4 x float> @test1(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
-; CHECK-LABEL: test1
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 
 define <4 x float> @test2(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
-; CHECK-LABEL: test2
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 
 define <4 x float> @test3(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
-; CHECK-LABEL: test3
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 
 define <4 x float> @test4(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
-; CHECK-LABEL: test4
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: movaps  %xmm1, %xmm0
-; CHECK: ret
-
 
 define <4 x float> @test5(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
-; CHECK-LABEL: test5
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 
 define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [0,65535,0,65535,0,65535,0,65535]
+; CHECK-NEXT:    andps %xmm0, %xmm1
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> %a, <8 x i16> %a
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test6
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 
 define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test7
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 
 define <8 x i16> @test8(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test8
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
 
 define <8 x i16> @test9(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test9:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test9
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: movaps  %xmm1, %xmm0
-; CHECK-NEXT: ret
 
 define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test10:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test10
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
 
 define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test11:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm2 = <0,65535,65535,0,u,65535,65535,u>
+; CHECK-NEXT:    andps %xmm2, %xmm0
+; CHECK-NEXT:    andnps %xmm1, %xmm2
+; CHECK-NEXT:    orps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test11
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
 
 define <8 x i16> @test12(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test12:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 false, i1 false, i1 undef, i1 false, i1 false, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test12
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
 
 define <8 x i16> @test13(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test13:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test13
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
 
 ; Fold (vselect (build_vector AllOnes), N1, N2) -> N1
-
 define <4 x float> @test14(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test14:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 undef, i1 true, i1 undef>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
-; CHECK-LABEL: test14
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: pcmpeq
-; CHECK: ret
 
 define <8 x i16> @test15(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test15:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 undef, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test15
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: pcmpeq
-; CHECK: ret
 
 ; Fold (vselect (build_vector AllZeros), N1, N2) -> N2
-
 define <4 x float> @test16(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 undef, i1 false, i1 undef>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
-} 
-; CHECK-LABEL: test16
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: ret 
+}
 
 define <8 x i16> @test17(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test17:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test17
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: ret
 
 define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test18:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movss %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
-; CHECK-LABEL: test18
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK: ret
 
 define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test19:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movss %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> %a, <4 x i32> %b
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test19
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK: ret
 
 define <2 x double> @test20(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test20:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %b
   ret <2 x double> %1
 }
-; CHECK-LABEL: test20
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
 
 define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test21:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <2 x i1> <i1 false, i1 true>, <2 x i64> %a, <2 x i64> %b
   ret <2 x i64> %1
 }
-; CHECK-LABEL: test21
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
 
 define <4 x float> @test22(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test22:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movss %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
-; CHECK-LABEL: test22
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK: ret
 
 define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test23:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movss %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %a, <4 x i32> %b
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test23
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK: ret
 
 define <2 x double> @test24(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test24:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %a, <2 x double> %b
   ret <2 x double> %1
 }
-; CHECK-LABEL: test24
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
 
 define <2 x i64> @test25(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test25:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <2 x i1> <i1 true, i1 false>, <2 x i64> %a, <2 x i64> %b
   ret <2 x i64> %1
 }
-; CHECK-LABEL: test25
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
 
 define <4 x float> @select_of_shuffles_0(<2 x float> %a0, <2 x float> %b0, <2 x float> %a1, <2 x float> %b1) {
-; CHECK-LABEL: select_of_shuffles_0
-; CHECK-DAG: movlhps %xmm2, [[REGA:%xmm[0-9]+]]
-; CHECK-DAG: movlhps %xmm3, [[REGB:%xmm[0-9]+]]
-; CHECK: subps [[REGB]], [[REGA]]
+; CHECK-LABEL: select_of_shuffles_0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; CHECK-NEXT:    subps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = shufflevector <2 x float> %a0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %2 = shufflevector <2 x float> %a1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
   %3 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %2, <4 x float> %1
@@ -276,3 +259,24 @@ define <4 x float> @select_of_shuffles_0(<2 x float> %a0, <2 x float> %b0, <2 x
   %7 = fsub <4 x float> %3, %6
   ret <4 x float> %7
 }
+
+; PR20677
+define <16 x double> @select_illegal(<16 x double> %a, <16 x double> %b) {
+; CHECK-LABEL: select_illegal:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm6
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm7
+; CHECK-NEXT:    movaps %xmm7, 112(%rdi)
+; CHECK-NEXT:    movaps %xmm6, 96(%rdi)
+; CHECK-NEXT:    movaps %xmm5, 80(%rdi)
+; CHECK-NEXT:    movaps %xmm4, 64(%rdi)
+; CHECK-NEXT:    movaps %xmm3, 48(%rdi)
+; CHECK-NEXT:    movaps %xmm2, 32(%rdi)
+; CHECK-NEXT:    movaps %xmm1, 16(%rdi)
+; CHECK-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-NEXT:    retq
+  %sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b
+  ret <16 x double> %sel
+}
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index d115929..e0b861f 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -2,12 +2,12 @@
 ; RUN: llc -march=x86 -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
 
 ; CHECK: movl
-; CHECK: paddd
+; CHECK: paddw
 ; CHECK: movlpd
 
 ; Scheduler causes produce a different instruction order
 ; ATOM: movl
-; ATOM: paddd
+; ATOM: paddw
 ; ATOM: movlpd
 
 ; bitcast a v4i16 to v2i32
diff --git a/test/CodeGen/X86/widen_conv-1.ll b/test/CodeGen/X86/widen_conv-1.ll
index 9f6778c..3f54ab6 100644
--- a/test/CodeGen/X86/widen_conv-1.ll
+++ b/test/CodeGen/X86/widen_conv-1.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
-; CHECK: paddq
+; CHECK: paddd
 
 ; truncate v2i64 to v2i32
 
diff --git a/test/CodeGen/X86/widen_conversions.ll b/test/CodeGen/X86/widen_conversions.ll
index 522ab47..8e5174f 100644
--- a/test/CodeGen/X86/widen_conversions.ll
+++ b/test/CodeGen/X86/widen_conversions.ll
@@ -9,7 +9,7 @@ define <4 x i32> @zext_v4i8_to_v4i32(<4 x i8>* %ptr) {
 ; CHECK:      movd (%{{.*}}), %[[X:xmm[0-9]+]]
 ; CHECK-NEXT: pxor %[[Z:xmm[0-9]+]], %[[Z]]
 ; CHECK-NEXT: punpcklbw %[[Z]], %[[X]]
-; CHECK-NEXT: punpcklbw %[[Z]], %[[X]]
+; CHECK-NEXT: punpcklwd %[[Z]], %[[X]]
 ; CHECK-NEXT: ret
 
   %val = load <4 x i8>* %ptr
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index 41bea85..0ec3574 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -4,12 +4,12 @@
 ;
 
 %i32vec3 = type <3 x i32>
-; CHECK: add3i32
 define void @add3i32(%i32vec3*  sret %ret, %i32vec3* %ap, %i32vec3* %bp)  {
-; CHECK: movdqa
-; CHECK: paddd
-; CHECK: pextrd
-; CHECK: movq
+; CHECK-LABEL: add3i32:
+; CHECK:         movdqa  (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    paddd   (%{{.*}}), %[[R0]]
+; CHECK-NEXT:    pextrd  $2, %[[R0]], 8(%{{.*}})
+; CHECK-NEXT:    movq    %[[R0]], (%{{.*}})
 	%a = load %i32vec3* %ap, align 16
 	%b = load %i32vec3* %bp, align 16
 	%x = add %i32vec3 %a, %b
@@ -17,15 +17,15 @@ define void @add3i32(%i32vec3*  sret %ret, %i32vec3* %ap, %i32vec3* %bp)  {
 	ret void
 }
 
-; CHECK: add3i32_2
 define void @add3i32_2(%i32vec3*  sret %ret, %i32vec3* %ap, %i32vec3* %bp)  {
-; CHECK: movq
-; CHECK: pinsrd
-; CHECK: movq
-; CHECK: pinsrd
-; CHECK: paddd
-; CHECK: pextrd
-; CHECK: movq
+; CHECK-LABEL: add3i32_2:
+; CHECK:         movq    (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    pinsrd  $2, 8(%{{.*}}), %[[R0]]
+; CHECK-NEXT:    movq    (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    pinsrd  $2, 8(%{{.*}}), %[[R1]]
+; CHECK-NEXT:    paddd   %[[R0]], %[[R1]]
+; CHECK-NEXT:    pextrd  $2, %[[R1]], 8(%{{.*}})
+; CHECK-NEXT:    movq    %[[R1]], (%{{.*}})
 	%a = load %i32vec3* %ap, align 8
 	%b = load %i32vec3* %bp, align 8
 	%x = add %i32vec3 %a, %b
@@ -34,15 +34,15 @@ define void @add3i32_2(%i32vec3*  sret %ret, %i32vec3* %ap, %i32vec3* %bp)  {
 }
 
 %i32vec7 = type <7 x i32>
-; CHECK: add7i32
 define void @add7i32(%i32vec7*  sret %ret, %i32vec7* %ap, %i32vec7* %bp)  {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddd
-; CHECK: paddd
-; CHECK: pextrd
-; CHECK: movq
-; CHECK: movdqa
+; CHECK-LABEL: add7i32:
+; CHECK:         movdqa  (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa  16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    paddd   (%{{.*}}), %[[R0]]
+; CHECK-NEXT:    paddd   16(%{{.*}}), %[[R1]]
+; CHECK-NEXT:    pextrd  $2, %[[R1]], 24(%{{.*}})
+; CHECK-NEXT:    movq    %[[R1]], 16(%{{.*}})
+; CHECK-NEXT:    movdqa  %[[R0]], (%{{.*}})
 	%a = load %i32vec7* %ap, align 16
 	%b = load %i32vec7* %bp, align 16
 	%x = add %i32vec7 %a, %b
@@ -50,18 +50,18 @@ define void @add7i32(%i32vec7*  sret %ret, %i32vec7* %ap, %i32vec7* %bp)  {
 	ret void
 }
 
-; CHECK: add12i32
 %i32vec12 = type <12 x i32>
 define void @add12i32(%i32vec12*  sret %ret, %i32vec12* %ap, %i32vec12* %bp)  {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddd
-; CHECK: paddd
-; CHECK: paddd
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: movdqa
+; CHECK-LABEL: add12i32:
+; CHECK:         movdqa  (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa  16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa  32(%{{.*}}), %[[R2:xmm[0-9]+]]
+; CHECK-NEXT:    paddd   (%{{.*}}), %[[R0]]
+; CHECK-NEXT:    paddd   16(%{{.*}}), %[[R1]]
+; CHECK-NEXT:    paddd   32(%{{.*}}), %[[R2]]
+; CHECK-NEXT:    movdqa  %[[R2]], 32(%{{.*}})
+; CHECK-NEXT:    movdqa  %[[R1]], 16(%{{.*}})
+; CHECK-NEXT:    movdqa  %[[R0]], (%{{.*}})
 	%a = load %i32vec12* %ap, align 16
 	%b = load %i32vec12* %bp, align 16
 	%x = add %i32vec12 %a, %b
@@ -70,11 +70,17 @@ define void @add12i32(%i32vec12*  sret %ret, %i32vec12* %ap, %i32vec12* %bp)  {
 }
 
 
-; CHECK: add3i16
 %i16vec3 = type <3 x i16>
 define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
-; CHECK: paddd
-; CHECK: ret
+; CHECK-LABEL: add3i16:
+; CHECK:         pmovzxwd (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    pmovzxwd (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    paddd    %[[R0]], %[[R1]]
+; CHECK-NEXT:    movdqa   %[[R1]], %[[R0]]
+; CHECK-NEXT:    pshufb   {{.*}}, %[[R0]]
+; CHECK-NEXT:    pmovzxdq %[[R0]], %[[R0]]
+; CHECK-NEXT:    pextrw   $4, %[[R1]], 4(%{{.*}})
+; CHECK-NEXT:    movd     %[[R0]], (%{{.*}})
 	%a = load %i16vec3* %ap, align 16
 	%b = load %i16vec3* %bp, align 16
 	%x = add %i16vec3 %a, %b
@@ -82,11 +88,13 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp
 	ret void
 }
 
-; CHECK: add4i16
 %i16vec4 = type <4 x i16>
 define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
-; CHECK: paddd
-; CHECK: movq
+; CHECK-LABEL: add4i16:
+; CHECK:         movq    (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    movq    (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    paddw   %[[R0]], %[[R1]]
+; CHECK-NEXT:    movq    %[[R1]], (%{{.*}})
 	%a = load %i16vec4* %ap, align 16
 	%b = load %i16vec4* %bp, align 16
 	%x = add %i16vec4 %a, %b
@@ -94,15 +102,15 @@ define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp
 	ret void
 }
 
-; CHECK: add12i16
 %i16vec12 = type <12 x i16>
 define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddw
-; CHECK: paddw
-; CHECK: movq
-; CHECK: movdqa
+; CHECK-LABEL: add12i16:
+; CHECK:         movdqa  (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa  16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    paddw   (%{{.*}}), %[[R0]]
+; CHECK-NEXT:    paddw   16(%{{.*}}), %[[R1]]
+; CHECK-NEXT:    movq    %[[R1]], 16(%{{.*}})
+; CHECK-NEXT:    movdqa  %[[R0]], (%{{.*}})
 	%a = load %i16vec12* %ap, align 16
 	%b = load %i16vec12* %bp, align 16
 	%x = add %i16vec12 %a, %b
@@ -110,18 +118,18 @@ define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12*
 	ret void
 }
 
-; CHECK: add18i16
 %i16vec18 = type <18 x i16>
 define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddw
-; CHECK: paddw
-; CHECK: paddw
-; CHECK: movd
-; CHECK: movdqa
-; CHECK: movdqa
+; CHECK-LABEL: add18i16:
+; CHECK:         movdqa  (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa  16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa  32(%{{.*}}), %[[R2:xmm[0-9]+]]
+; CHECK-NEXT:    paddw   (%{{.*}}), %[[R0]]
+; CHECK-NEXT:    paddw   16(%{{.*}}), %[[R1]]
+; CHECK-NEXT:    paddw   32(%{{.*}}), %[[R2]]
+; CHECK-NEXT:    movd    %[[R2]], 32(%{{.*}})
+; CHECK-NEXT:    movdqa  %[[R1]], 16(%{{.*}})
+; CHECK-NEXT:    movdqa  %[[R0]], (%{{.*}})
 	%a = load %i16vec18* %ap, align 16
 	%b = load %i16vec18* %bp, align 16
 	%x = add %i16vec18 %a, %b
@@ -130,11 +138,18 @@ define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18*
 }
 
 
-; CHECK: add3i8
 %i8vec3 = type <3 x i8>
 define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
-; CHECK: paddd
-; CHECK: ret
+; CHECK-LABEL: add3i8:
+; CHECK:         pmovzxbd (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    pmovzxbd (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    paddd    %[[R0]], %[[R1]]
+; CHECK-NEXT:    movdqa   %[[R1]], %[[R0]]
+; CHECK-NEXT:    pshufb   {{.*}}, %[[R0]]
+; CHECK-NEXT:    pmovzxwq %[[R0]], %[[R0]]
+; CHECK-NEXT:    pextrb   $8, %[[R1]], 2(%{{.*}})
+; CHECK-NEXT:    movd     %[[R0]], %e[[R2:[abcd]]]x
+; CHECK-NEXT:    movw     %[[R2]]x, (%{{.*}})
 	%a = load %i8vec3* %ap, align 16
 	%b = load %i8vec3* %bp, align 16
 	%x = add %i8vec3 %a, %b
@@ -142,17 +157,18 @@ define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) no
 	ret void
 }
 
-; CHECK-LABEL: add31i8:
 %i8vec31 = type <31 x i8>
 define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddb
-; CHECK: paddb
-; CHECK: pextrb
-; CHECK: pextrw
-; CHECK: movq
-; CHECK: ret
+; CHECK-LABEL: add31i8:
+; CHECK:         movdqa  (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa  16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    paddb   (%{{.*}}), %[[R0]]
+; CHECK-NEXT:    paddb   16(%{{.*}}), %[[R1]]
+; CHECK-NEXT:    pextrb  $14, %[[R1]], 30(%{{.*}})
+; CHECK-NEXT:    pextrw  $6, %[[R1]], 28(%{{.*}})
+; CHECK-NEXT:    pextrd  $2, %[[R1]], 24(%{{.*}})
+; CHECK-NEXT:    movq    %[[R1]], 16(%{{.*}})
+; CHECK-NEXT:    movdqa  %[[R0]], (%{{.*}})
 	%a = load %i8vec31* %ap, align 16
 	%b = load %i8vec31* %bp, align 16
 	%x = add %i8vec31 %a, %b
@@ -161,14 +177,43 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp
 }
 
 
-; CHECK: rot
 %i8vec3pack = type { <3 x i8>, i8 }
-define %i8vec3pack  @rot() nounwind {
-; CHECK: pmovzxbd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}}
+define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
+; CHECK-LABEL: rot:
+; CHECK:         movdqa  {{.*}}, %[[CONSTANT0:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa  {{.*}}, %[[SHUFFLE_MASK:xmm[0-9]+]]
+; CHECK-NEXT:    pshufb  %[[SHUFFLE_MASK]], %[[CONSTANT0]]
+; CHECK-NEXT:    pmovzxwq %[[CONSTANT0]], %[[CONSTANT0]]
+; CHECK-NEXT:    movd    %[[CONSTANT0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT:    movw    %[[R0]]x, (%[[PTR0:.*]])
+; CHECK-NEXT:    movb    $-98, 2(%[[PTR0]])
+; CHECK-NEXT:    movdqa  {{.*}}, %[[CONSTANT1:xmm[0-9]+]]
+; CHECK-NEXT:    pshufb  %[[SHUFFLE_MASK]], %[[CONSTANT1]]
+; CHECK-NEXT:    pmovzxwq %[[CONSTANT1]], %[[CONSTANT1]]
+; CHECK-NEXT:    movd    %[[CONSTANT1]], %e[[R1:[abcd]]]x
+; CHECK-NEXT:    movw    %[[R1]]x, (%[[PTR1:.*]])
+; CHECK-NEXT:    movb    $1, 2(%[[PTR1]])
+; CHECK-NEXT:    pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]]
+; CHECK-NEXT:    pand    {{.*}}, %[[X0]]
+; CHECK-NEXT:    pextrd  $1, %[[X0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT:    shrl    %e[[R0]]x
+; CHECK-NEXT:    movd    %[[X0]], %e[[R1:[abcd]]]x
+; CHECK-NEXT:    shrl    %e[[R1]]x
+; CHECK-NEXT:    movd    %e[[R1]]x, %[[X1:xmm[0-9]+]]
+; CHECK-NEXT:    pinsrd  $1, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT:    pextrd  $2, %[[X0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT:    shrl    %e[[R0]]x
+; CHECK-NEXT:    pinsrd  $2, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT:    pextrd  $3, %[[X0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT:    pinsrd  $3, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT:    movdqa  %[[X1]], %[[X2:xmm[0-9]+]]
+; CHECK-NEXT:    pshufb  %[[SHUFFLE_MASK]], %[[X2]]
+; CHECK-NEXT:    pmovzxwq %[[X2]], %[[X3:xmm[0-9]+]]
+; CHECK-NEXT:    pextrb  $8, %[[X1]], 2(%{{.*}})
+; CHECK-NEXT:    movd    %[[X3]], %e[[R0:[abcd]]]x
+; CHECK-NEXT:    movw    %[[R0]]x, (%{{.*}})
+
 entry:
-  %X = alloca %i8vec3pack, align 4
-  %rot = alloca %i8vec3pack, align 4
-  %result = alloca %i8vec3pack, align 4
   %storetmp = bitcast %i8vec3pack* %X to <3 x i8>*
   store <3 x i8> <i8 -98, i8 -98, i8 -98>, <3 x i8>* %storetmp
   %storetmp1 = bitcast %i8vec3pack* %rot to <3 x i8>*
@@ -180,7 +225,6 @@ entry:
   %shr = lshr <3 x i8> %extractVec, %extractVec3
   %storetmp4 = bitcast %i8vec3pack* %result to <3 x i8>*
   store <3 x i8> %shr, <3 x i8>* %storetmp4
-  %tmp5 = load %i8vec3pack* %result
-  ret %i8vec3pack %tmp5
+  ret void
 }
 
diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll
index a355b75..70fdbb7 100644
--- a/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/test/CodeGen/X86/widen_shuffle-1.ll
@@ -1,43 +1,56 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
 
+target triple = "x86_64-unknown-unknown"
+
 ; widening shuffle v3float and then a add
 define void @shuf(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
-entry:
 ; CHECK-LABEL: shuf:
-; CHECK: extractps
-; CHECK: extractps
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    addps %xmm1, %xmm0
+; CHECK-NEXT:    extractps $2, %xmm0, 8(%eax)
+; CHECK-NEXT:    extractps $1, %xmm0, 4(%eax)
+; CHECK-NEXT:    movss %xmm0, (%eax)
+; CHECK-NEXT:    retl
+entry:
 	%x = shufflevector <3 x float> %src1, <3 x float> %src2, <3 x i32> < i32 0, i32 1, i32 2>
 	%val = fadd <3 x float> %x, %src2
 	store <3 x float> %val, <3 x float>* %dst.addr
 	ret void
-; CHECK: ret
 }
 
 
 ; widening shuffle v3float with a different mask and then a add
 define void @shuf2(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
-entry:
 ; CHECK-LABEL: shuf2:
-; CHECK: extractps
-; CHECK: extractps
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; CHECK-NEXT:    addps %xmm1, %xmm0
+; CHECK-NEXT:    extractps $2, %xmm0, 8(%eax)
+; CHECK-NEXT:    extractps $1, %xmm0, 4(%eax)
+; CHECK-NEXT:    movss %xmm0, (%eax)
+; CHECK-NEXT:    retl
+entry:
 	%x = shufflevector <3 x float> %src1, <3 x float> %src2, <3 x i32> < i32 0, i32 4, i32 2>
 	%val = fadd <3 x float> %x, %src2
 	store <3 x float> %val, <3 x float>* %dst.addr
 	ret void
-; CHECK: ret
 }
 
 ; Example of when widening a v3float operation causes the DAG to replace a node
 ; with the operation that we are currently widening, i.e. when replacing
 ; opA with opB, the DAG will produce new operations with opA.
 define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind {
-entry:
 ; CHECK-LABEL: shuf3:
-; CHECK-NOT: movlhps
-; CHECK-NOT: shufps
-; CHECK: pshufd
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; CHECK-NEXT:    movaps %xmm1, (%eax)
+; CHECK-NEXT:    retl
+entry:
   %shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> 
+  %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
   %tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %tmp3.i13 = shufflevector <4 x float> %tmp1.i.i, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> ; <<3 x float>>
   %tmp6.i14 = shufflevector <3 x float> %tmp3.i13, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -45,27 +58,35 @@ entry:
   %tmp2.i18 = shufflevector <3 x float> %tmp97.i, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
   %t5 = bitcast <4 x float> %tmp2.i18 to <4 x i32>
   %shr.i.i19 = lshr <4 x i32> %t5, <i32 19, i32 19, i32 19, i32 19>
-  %and.i.i20 = and <4 x i32> %shr.i.i19, <i32 4080, i32 4080, i32 4080, i32 4080> 
+  %and.i.i20 = and <4 x i32> %shr.i.i19, <i32 4080, i32 4080, i32 4080, i32 4080>
   %shuffle.i.i.i21 = shufflevector <4 x float> %tmp2.i18, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
   store <4 x float> %shuffle.i.i.i21, <4 x float>* %dst
   ret void
-; CHECK: ret
 }
 
 ; PR10421: make sure we correctly handle extreme widening with CONCAT_VECTORS
 define <8 x i8> @shuf4(<4 x i8> %a, <4 x i8> %b) nounwind readnone {
 ; CHECK-LABEL: shuf4:
-; CHECK-NOT: punpckldq
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    pshufb %xmm2, %xmm1
+; CHECK-NEXT:    pshufb %xmm2, %xmm0
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retl
   %vshuf = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i8> %vshuf
-; CHECK: ret
 }
 
 ; PR11389: another CONCAT_VECTORS case
 define void @shuf5(<8 x i8>* %p) nounwind {
 ; CHECK-LABEL: shuf5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = <4,33,u,u,u,u,u,u>
+; CHECK-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    movlpd %xmm0, (%eax)
+; CHECK-NEXT:    retl
   %v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   store <8 x i8> %v, <8 x i8>* %p, align 8
   ret void
-; CHECK: ret
 }
diff --git a/test/CodeGen/X86/win32-pic-jumptable.ll b/test/CodeGen/X86/win32-pic-jumptable.ll
new file mode 100644
index 0000000..cabd36a
--- /dev/null
+++ b/test/CodeGen/X86/win32-pic-jumptable.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -relocation-model=pic | FileCheck %s
+
+; CHECK:        calll L0$pb
+; CHECK-NEXT: L0$pb:
+; CHECK-NEXT:   popl %eax
+; CHECK-NEXT:   addl LJTI0_0(,%ecx,4), %eax
+; CHECK-NEXT:   jmpl *%eax
+
+; CHECK:      LJTI0_0:
+; CHECK-NEXT:   .long LBB0_4-L0$pb
+; CHECK-NEXT:   .long LBB0_5-L0$pb
+; CHECK-NEXT:   .long LBB0_6-L0$pb
+; CHECK-NEXT:   .long LBB0_7-L0$pb
+
+
+target triple = "i686--windows-itanium"
+define i32 @f(i64 %x) {
+bb0:
+  switch i64 %x, label %bb5 [
+    i64 1, label %bb1
+    i64 2, label %bb2
+    i64 3, label %bb3
+    i64 4, label %bb4
+  ]
+bb1:
+  br label %bb5
+bb2:
+  br label %bb5
+bb3:
+  br label %bb5
+bb4:
+  br label %bb5
+bb5:
+  %y = phi i32 [ 0, %bb0 ], [ 1, %bb1 ], [ 2, %bb2 ], [ 3, %bb3 ], [ 4, %bb4 ]
+  ret i32 %y
+}
diff --git a/test/CodeGen/X86/win64_call_epi.ll b/test/CodeGen/X86/win64_call_epi.ll
new file mode 100644
index 0000000..bc73ad4
--- /dev/null
+++ b/test/CodeGen/X86/win64_call_epi.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -mtriple=x86_64-pc-mingw32 | FileCheck %s -check-prefix=WIN64
+
+declare void @bar()
+declare void @baz()
+declare i32 @personality(...)
+
+; Check for 'nop' between the last call and the epilogue.
+define void @foo1() {
+
+    invoke void @bar()
+        to label %normal
+        unwind label %catch
+
+normal:
+    ret void
+
+catch:
+    %1 = landingpad { i8*, i32 } personality i32 (...)* @personality cleanup
+    resume { i8*, i32 } %1
+}
+; WIN64-LABEL: foo1:
+; WIN64: .seh_proc foo1
+; WIN64: callq bar
+; WIN64: nop
+; WIN64: addq ${{[0-9]+}}, %rsp
+; WIN64: retq
+; Check for 'ud2' after noreturn call
+; WIN64: callq _Unwind_Resume
+; WIN64-NEXT: ud2
+; WIN64: .seh_endproc
+
+
+; Check it still works when blocks are reordered.
+@something = global i32 0
+define void @foo2(i1 zeroext %cond ) {
+    br i1 %cond, label %a, label %b, !prof !0
+a:
+    call void @bar()
+    br label %done
+b:
+    call void @baz()
+    store i32 0, i32* @something
+    br label %done
+done:
+    ret void
+}
+!0 = metadata !{metadata !"branch_weights", i32 100, i32 0}
+; WIN64-LABEL: foo2:
+; WIN64: callq bar
+; WIN64: nop
+; WIN64: addq ${{[0-9]+}}, %rsp
+; WIN64: retq
+
+
+; Check nop is not emitted when call is not adjacent to epilogue.
+define i32 @foo3() {
+    call void @bar()
+    ret i32 0
+}
+; WIN64-LABEL: foo3:
+; WIN64: callq bar
+; WIN64: xorl
+; WIN64-NOT: nop
+; WIN64: addq ${{[0-9]+}}, %rsp
+; WIN64: retq
diff --git a/test/CodeGen/X86/win64_vararg.ll b/test/CodeGen/X86/win64_vararg.ll
index 1a51b2a..8d7f201 100644
--- a/test/CodeGen/X86/win64_vararg.ll
+++ b/test/CodeGen/X86/win64_vararg.ll
@@ -111,3 +111,22 @@ entry:
   %tmp = va_arg i8** %ap, i32
   ret i32 %tmp
 }
+
+define void @sret_arg(i32* sret %agg.result, i8* nocapture readnone %format, ...) {
+entry:
+  %ap = alloca i8*
+  %ap_i8 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap_i8)
+  %tmp = va_arg i8** %ap, i32
+  store i32 %tmp, i32* %agg.result
+  ret void
+}
+; CHECK-LABEL: sret_arg:
+; CHECK: pushq
+; CHECK-DAG: movq %r9, 40(%rsp)
+; CHECK-DAG: movq %r8, 32(%rsp)
+; CHECK: movl 32(%rsp), %[[tmp:[^ ]*]]
+; CHECK: movl %[[tmp]], (%[[sret:[^ ]*]])
+; CHECK: movq %[[sret]], %rax
+; CHECK: popq
+; CHECK: retq
diff --git a/test/CodeGen/X86/win_cst_pool.ll b/test/CodeGen/X86/win_cst_pool.ll
new file mode 100644
index 0000000..e8b853a
--- /dev/null
+++ b/test/CodeGen/X86/win_cst_pool.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+define double @double() {
+  ret double 0x0000000000800000
+}
+; CHECK:              .globl  __real@0000000000800000
+; CHECK-NEXT:         .section        .rdata,"rd",discard,__real@0000000000800000
+; CHECK-NEXT:         .align  8
+; CHECK-NEXT: __real@0000000000800000:
+; CHECK-NEXT:         .quad   8388608
+; CHECK:      double:
+; CHECK:               movsd   __real@0000000000800000(%rip), %xmm0
+; CHECK-NEXT:          ret
+
+define <4 x i32> @vec1() {
+  ret <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+}
+; CHECK:              .globl  __xmm@00000000000000010000000200000003
+; CHECK-NEXT:         .section        .rdata,"rd",discard,__xmm@00000000000000010000000200000003
+; CHECK-NEXT:         .align  16
+; CHECK-NEXT: __xmm@00000000000000010000000200000003:
+; CHECK-NEXT:         .long   3
+; CHECK-NEXT:         .long   2
+; CHECK-NEXT:         .long   1
+; CHECK-NEXT:         .long   0
+; CHECK:      vec1:
+; CHECK:               movaps  __xmm@00000000000000010000000200000003(%rip), %xmm0
+; CHECK-NEXT:          ret
+
+define <8 x i16> @vec2() {
+  ret <8 x i16> <i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>
+}
+; CHECK:             .globl  __xmm@00000001000200030004000500060007
+; CHECK-NEXT:        .section        .rdata,"rd",discard,__xmm@00000001000200030004000500060007
+; CHECK-NEXT:        .align  16
+; CHECK-NEXT: __xmm@00000001000200030004000500060007:
+; CHECK-NEXT:        .short  7
+; CHECK-NEXT:        .short  6
+; CHECK-NEXT:        .short  5
+; CHECK-NEXT:        .short  4
+; CHECK-NEXT:        .short  3
+; CHECK-NEXT:        .short  2
+; CHECK-NEXT:        .short  1
+; CHECK-NEXT:        .short  0
+; CHECK:      vec2:
+; CHECK:               movaps  __xmm@00000001000200030004000500060007(%rip), %xmm0
+; CHECK-NEXT:          ret
+
+
+define <4 x float> @undef1() {
+  ret <4 x float> <float 1.0, float 1.0, float undef, float undef>
+
+; CHECK:             .globl  __xmm@00000000000000003f8000003f800000
+; CHECK-NEXT:        .section        .rdata,"rd",discard,__xmm@00000000000000003f8000003f800000
+; CHECK-NEXT:        .align  16
+; CHECK-NEXT: __xmm@00000000000000003f8000003f800000:
+; CHECK-NEXT:        .long   1065353216              # float 1
+; CHECK-NEXT:        .long   1065353216              # float 1
+; CHECK-NEXT:        .zero   4
+; CHECK-NEXT:        .zero   4
+; CHECK:      undef1:
+; CHECK:               movaps  __xmm@00000000000000003f8000003f800000(%rip), %xmm0
+; CHECK-NEXT:          ret
+}
diff --git a/test/CodeGen/X86/windows-itanium-alloca.ll b/test/CodeGen/X86/windows-itanium-alloca.ll
new file mode 100644
index 0000000..0a06cde
--- /dev/null
+++ b/test/CodeGen/X86/windows-itanium-alloca.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple i686-windows-itanium -filetype asm -o - %s | FileCheck %s
+
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686--windows-itanium"
+
+declare void @external(i8*)
+
+define dllexport void @alloca(i32 %sz) {
+entry:
+  %vla = alloca i8, i32 %sz, align 1
+  call void @external(i8* %vla)
+  ret void
+}
+
+; CHECK: __chkstk
+
diff --git a/test/CodeGen/X86/x32-function_pointer-1.ll b/test/CodeGen/X86/x32-function_pointer-1.ll
new file mode 100644
index 0000000..2baf92a
--- /dev/null
+++ b/test/CodeGen/X86/x32-function_pointer-1.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32  | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel | FileCheck %s
+
+; Test for x32 function pointer tail call
+
+@foo1 = external global void (i8*)*
+@foo2 = external global void (i8*)*
+
+define void @bar(i8* %h) nounwind uwtable {
+entry:
+  %0 = load void (i8*)** @foo1, align 4
+; CHECK: movl	foo1(%rip), %e{{[^,]*}}
+  tail call void %0(i8* %h) nounwind
+; CHECK: callq	*%r{{[^,]*}}
+  %1 = load void (i8*)** @foo2, align 4
+; CHECK: movl	foo2(%rip), %e{{[^,]*}}
+  tail call void %1(i8* %h) nounwind
+; CHECK: jmpq	*%r{{[^,]*}}
+  ret void
+}
diff --git a/test/CodeGen/X86/x32-function_pointer-2.ll b/test/CodeGen/X86/x32-function_pointer-2.ll
new file mode 100644
index 0000000..f727d41
--- /dev/null
+++ b/test/CodeGen/X86/x32-function_pointer-2.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel | FileCheck %s
+
+; Test call function pointer with function argument
+;
+; void bar (void * h, void (*foo) (void *))
+;    {
+;      foo (h);
+;      foo (h);
+;    }
+
+
+define void @bar(i8* %h, void (i8*)* nocapture %foo) nounwind {
+entry:
+  tail call void %foo(i8* %h) nounwind
+; CHECK: mov{{l|q}}	%{{e|r}}si, %{{e|r}}[[REG:.*]]{{d?}}
+; CHECK: callq	*%r[[REG]]
+  tail call void %foo(i8* %h) nounwind
+; CHECK: jmpq	*%r{{[^,]*}}
+  ret void
+}
diff --git a/test/CodeGen/X86/x32-function_pointer-3.ll b/test/CodeGen/X86/x32-function_pointer-3.ll
new file mode 100644
index 0000000..5eaf85d
--- /dev/null
+++ b/test/CodeGen/X86/x32-function_pointer-3.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel | FileCheck %s
+
+; Test calling function pointer passed in struct
+
+;    The fuction argument `h' in
+
+;    struct foo {
+;      void (*f) (void);
+;      int i;
+;    };
+;    void
+;    bar (struct foo h)
+;    {
+;      h.f ();
+;    }
+
+;    is passed in the 64-bit %rdi register.  The `f' field is in the lower 32
+;    bits of %rdi register and the `i' field is in the upper 32 bits of %rdi
+;    register.  We need to zero-extend %edi to %rdi before branching via %rdi.
+
+define void @bar(i64 %h.coerce) nounwind {
+entry:
+  %h.sroa.0.0.extract.trunc = trunc i64 %h.coerce to i32
+  %0 = inttoptr i32 %h.sroa.0.0.extract.trunc to void ()*
+; CHECK: movl	%edi, %e[[REG:.*]]
+  tail call void %0() nounwind
+; CHECK: jmpq	*%r[[REG]]
+  ret void
+}
diff --git a/test/CodeGen/X86/x86-64-call.ll b/test/CodeGen/X86/x86-64-call.ll
new file mode 100644
index 0000000..300f8d1
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-call.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-pc-linux -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-pc-linux-gnux32 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686-pc-linux -verify-machineinstrs | FileCheck %s -check-prefix=IA32
+
+; trivial test for correct call suffix
+
+define i32 @far() nounwind uwtable {
+entry:
+; CHECK: callq foo
+; IA32: calll foo
+  tail call void @foo() nounwind
+  ret i32 0
+}
+
+declare void @foo()
diff --git a/test/CodeGen/X86/x86-64-pic-10.ll b/test/CodeGen/X86/x86-64-pic-10.ll
index da8082b..8790fa6 100644
--- a/test/CodeGen/X86/x86-64-pic-10.ll
+++ b/test/CodeGen/X86/x86-64-pic-10.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
 ; RUN: grep "callq	g@PLT" %t1
 
-@g = alias weak i32 ()* @f
+@g = weak alias i32 ()* @f
 
 define void @h() {
 entry:
diff --git a/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll b/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll
new file mode 100644
index 0000000..c476ffd
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux-gnux32 < %s | FileCheck -check-prefix=X32ABI %s
+; RUN: llc -mtriple=x86_64-pc-nacl < %s | FileCheck -check-prefix=NACL %s
+
+; x32 uses %esp, %ebp as stack and frame pointers
+
+; CHECK-LABEL: foo
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: movq %rdi, -8(%rbp)
+; CHECK: popq %rbp
+; X32ABI-LABEL: foo
+; X32ABI: pushq %rbp
+; X32ABI: movl %esp, %ebp
+; X32ABI: movl %edi, -4(%ebp)
+; X32ABI: popq %rbp
+; NACL-LABEL: foo
+; NACL: pushq %rbp
+; NACL: movq %rsp, %rbp
+; NACL: movl %edi, -4(%rbp)
+; NACL: popq %rbp
+
+
+define void @foo(i32* %a) #0 {
+entry:
+  %a.addr = alloca i32*, align 4
+  %b = alloca i32*, align 4
+  store i32* %a, i32** %a.addr, align 4
+  ret void
+}
+
+attributes #0 = { nounwind uwtable "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"}
+
+
diff --git a/test/CodeGen/X86/x86-64-tls-1.ll b/test/CodeGen/X86/x86-64-tls-1.ll
index 641786f..2879fb4 100644
--- a/test/CodeGen/X86/x86-64-tls-1.ll
+++ b/test/CodeGen/X86/x86-64-tls-1.ll
@@ -1,10 +1,9 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
 @tm_nest_level = internal thread_local global i32 0
 define i64 @z() nounwind {
-; FIXME: The codegen here is primitive at best and could be much better.
-; The add and the moves can be folded together.
-; CHECK-DAG: movq    $tm_nest_level@TPOFF, %rcx
-; CHECK-DAG: movq    %fs:0, %rax
-; CHECK: addl    %ecx, %eax
+; CHECK:      movq    $tm_nest_level@TPOFF, %r[[R0:[abcd]]]x
+; CHECK-NEXT: addl    %fs:0, %e[[R0]]x
+; CHECK-NEXT: andq    $100, %r[[R0]]x
+
   ret i64 and (i64 ptrtoint (i32* @tm_nest_level to i64), i64 100)
 }
diff --git a/test/CodeGen/X86/x86-mixed-alignment-dagcombine.ll b/test/CodeGen/X86/x86-mixed-alignment-dagcombine.ll
new file mode 100644
index 0000000..fcf7eae
--- /dev/null
+++ b/test/CodeGen/X86/x86-mixed-alignment-dagcombine.ll
@@ -0,0 +1,35 @@
+; RUN: llc  -mtriple=x86_64-apple-macosx10.9.0  -mcpu=core2 -mattr=+64bit,+sse2 < %s | FileCheck %s
+
+; DAGCombine may choose to rewrite 2 loads feeding a select as a select of
+; addresses feeding a load. This test ensures that when it does that it creates
+; a load with alignment equivalent to the most restrictive source load.
+
+declare void @sink(<2 x double>)
+
+define void @test1(i1 %cmp) align 2 {
+  %1 = alloca  <2 x double>, align 16
+  %2 = alloca  <2 x double>, align 8
+
+  %val = load <2 x double>* %1, align 16
+  %val2 = load <2 x double>* %2, align 8
+  %val3 = select i1 %cmp, <2 x double> %val, <2 x double> %val2
+  call void @sink(<2 x double> %val3)
+  ret void
+  ; CHECK: test1
+  ; CHECK: movups
+  ; CHECK: ret
+}
+
+define void @test2(i1 %cmp) align 2 {
+  %1 = alloca  <2 x double>, align 16
+  %2 = alloca  <2 x double>, align 8
+
+  %val = load <2 x double>* %1, align 16
+  %val2 = load <2 x double>* %2, align 16
+  %val3 = select i1 %cmp, <2 x double> %val, <2 x double> %val2
+  call void @sink(<2 x double> %val3)
+  ret void
+  ; CHECK: test2
+  ; CHECK: movaps
+  ; CHECK: ret
+}
diff --git a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
new file mode 100644
index 0000000..4317d8a
--- /dev/null
+++ b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+
+define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind {
+; CHECK-LABEL: LCPI0_0:
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-LABEL: foo:
+; CHECK: cmpeqps %xmm1, %xmm0
+; CHECK-NEXT: andps LCPI0_0(%rip), %xmm0
+; CHECK-NEXT: retq
+
+  %cmp = fcmp oeq <4 x float> %val, %test
+  %ext = zext <4 x i1> %cmp to <4 x i32>
+  %result = sitofp <4 x i32> %ext to <4 x float>
+  ret <4 x float> %result
+}
+
+; Make sure the operation doesn't try to get folded when the sizes don't match,
+; as that ends up crashing later when trying to form a bitcast operation for
+; the folded nodes.
+define void @foo1(<4 x float> %val, <4 x float> %test, <4 x double>* %p) nounwind {
+; CHECK-LABEL: LCPI1_0:
+; CHECK-NEXT: .long 1                       ## 0x1
+; CHECK-NEXT: .long 1                       ## 0x1
+; CHECK-NEXT: .long 1                       ## 0x1
+; CHECK-NEXT: .long 1                       ## 0x1
+; CHECK-LABEL: foo1:
+;   FIXME: The operation gets scalarized. If/when the compiler learns to better
+;          use [V]CVTDQ2PD, this will need updated.
+; CHECK: cvtsi2sdq
+; CHECK: cvtsi2sdq
+; CHECK: cvtsi2sdq
+; CHECK: cvtsi2sdq
+  %cmp = fcmp oeq <4 x float> %val, %test
+  %ext = zext <4 x i1> %cmp to <4 x i32>
+  %result = sitofp <4 x i32> %ext to <4 x double>
+  store <4 x double> %result, <4 x double>* %p
+  ret void
+}
+
+; Also test the general purpose constant folding of int->fp.
+define void @foo2(<4 x float>* noalias %result) nounwind {
+; CHECK-LABEL: LCPI2_0:
+; CHECK-NEXT: .long 1082130432              ## float 4.000000e+00
+; CHECK-NEXT: .long 1084227584              ## float 5.000000e+00
+; CHECK-NEXT: .long 1086324736              ## float 6.000000e+00
+; CHECK-NEXT: .long 1088421888              ## float 7.000000e+00
+; CHECK-LABEL: foo2:
+; CHECK:  movaps LCPI2_0(%rip), %xmm0
+
+  %val = uitofp <4 x i32> <i32 4, i32 5, i32 6, i32 7> to <4 x float>
+  store <4 x float> %val, <4 x float>* %result
+  ret void
+}
+
+; Fold explicit AND operations when the constant isn't a splat of a single
+; scalar value like what the zext creates.
+define <4 x float> @foo3(<4 x float> %val, <4 x float> %test) nounwind {
+; CHECK-LABEL: LCPI3_0:
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-NEXT: .long 0                       ## 0x0
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-NEXT: .long 0                       ## 0x0
+; CHECK-LABEL: foo3:
+; CHECK: cmpeqps %xmm1, %xmm0
+; CHECK-NEXT: andps LCPI3_0(%rip), %xmm0
+  %cmp = fcmp oeq <4 x float> %val, %test
+  %ext = zext <4 x i1> %cmp to <4 x i32>
+  %and = and <4 x i32> %ext, <i32 255, i32 256, i32 257, i32 258>
+  %result = sitofp <4 x i32> %and to <4 x float>
+  ret <4 x float> %result
+}
diff --git a/test/CodeGen/X86/xaluo.ll b/test/CodeGen/X86/xaluo.ll
index f078631..54a4d6aa 100644
--- a/test/CodeGen/X86/xaluo.ll
+++ b/test/CodeGen/X86/xaluo.ll
@@ -1,7 +1,5 @@
-; RUN: llc -mtriple=x86_64-darwin-unknown < %s                             | FileCheck %s --check-prefix=DAG
-; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=FAST
-; RUN: llc -mtriple=x86_64-darwin-unknown < %s                             | FileCheck %s
-; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-darwin-unknown                             < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG
+; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
 
 ;
 ; Get the actual value of the overflow bit.
@@ -9,12 +7,9 @@
 ; SADDO reg, reg
 define zeroext i1 @saddo.i8(i8 signext %v1, i8 signext %v2, i8* %res) {
 entry:
-; DAG-LABEL:    saddo.i8
-; DAG:          addb %sil, %dil
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   saddo.i8
-; FAST:         addb %sil, %dil
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: saddo.i8
+; CHECK:       addb %sil, %dil
+; CHECK-NEXT:  seto %al
   %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
   %obit = extractvalue {i8, i1} %t, 1
@@ -24,12 +19,9 @@ entry:
 
 define zeroext i1 @saddo.i16(i16 %v1, i16 %v2, i16* %res) {
 entry:
-; DAG-LABEL:    saddo.i16
-; DAG:          addw %si, %di
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   saddo.i16
-; FAST:         addw %si, %di
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: saddo.i16
+; CHECK:       addw %si, %di
+; CHECK-NEXT:  seto %al
   %t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
   %obit = extractvalue {i16, i1} %t, 1
@@ -39,12 +31,9 @@ entry:
 
 define zeroext i1 @saddo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
-; DAG-LABEL:    saddo.i32
-; DAG:          addl %esi, %edi
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   saddo.i32
-; FAST:         addl %esi, %edi
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: saddo.i32
+; CHECK:       addl %esi, %edi
+; CHECK-NEXT:  seto %al
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -54,12 +43,9 @@ entry:
 
 define zeroext i1 @saddo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
-; DAG-LABEL:    saddo.i64
-; DAG:          addq %rsi, %rdi
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   saddo.i64
-; FAST:         addq %rsi, %rdi
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: saddo.i64
+; CHECK:       addq %rsi, %rdi
+; CHECK-NEXT:  seto %al
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -67,16 +53,48 @@ entry:
   ret i1 %obit
 }
 
-; SADDO reg, imm | imm, reg
-; FIXME: INC isn't supported in FastISel yet
-define zeroext i1 @saddo.i64imm1(i64 %v1, i64* %res) {
+; SADDO reg, 1 | INC
+define zeroext i1 @saddo.inc.i8(i8 %v1, i8* %res) {
+entry:
+; CHECK-LABEL: saddo.inc.i8
+; CHECK:       incb %dil
+; CHECK-NEXT:  seto %al
+  %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 1)
+  %val = extractvalue {i8, i1} %t, 0
+  %obit = extractvalue {i8, i1} %t, 1
+  store i8 %val, i8* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo.inc.i16(i16 %v1, i16* %res) {
+entry:
+; CHECK-LABEL: saddo.inc.i16
+; CHECK:       incw %di
+; CHECK-NEXT:  seto %al
+  %t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 1)
+  %val = extractvalue {i16, i1} %t, 0
+  %obit = extractvalue {i16, i1} %t, 1
+  store i16 %val, i16* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo.inc.i32(i32 %v1, i32* %res) {
 entry:
-; DAG-LABEL:    saddo.i64imm1
-; DAG:          incq %rdi
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   saddo.i64imm1
-; FAST:         addq $1, %rdi
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: saddo.inc.i32
+; CHECK:       incl %edi
+; CHECK-NEXT:  seto %al
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 1)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo.inc.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL: saddo.inc.i64
+; CHECK:       incq %rdi
+; CHECK-NEXT:  seto %al
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 1)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -84,17 +102,18 @@ entry:
   ret i1 %obit
 }
 
+; SADDO reg, imm | imm, reg
 ; FIXME: DAG doesn't optimize immediates on the LHS.
-define zeroext i1 @saddo.i64imm2(i64 %v1, i64* %res) {
+define zeroext i1 @saddo.i64imm1(i64 %v1, i64* %res) {
 entry:
-; DAG-LABEL:    saddo.i64imm2
-; DAG:          mov
-; DAG-NEXT:     addq
-; DAG-NEXT:     seto
-; FAST-LABEL:   saddo.i64imm2
-; FAST:         addq $1, %rdi
-; FAST-NEXT:    seto %al
-  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 1, i64 %v1)
+; SDAG-LABEL: saddo.i64imm1
+; SDAG:       mov
+; SDAG-NEXT:  addq
+; SDAG-NEXT:  seto
+; FAST-LABEL: saddo.i64imm1
+; FAST:       addq $2, %rdi
+; FAST-NEXT:  seto %al
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 2, i64 %v1)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
   store i64 %val, i64* %res
@@ -102,14 +121,11 @@ entry:
 }
 
 ; Check boundary conditions for large immediates.
-define zeroext i1 @saddo.i64imm3(i64 %v1, i64* %res) {
+define zeroext i1 @saddo.i64imm2(i64 %v1, i64* %res) {
 entry:
-; DAG-LABEL:    saddo.i64imm3
-; DAG:          addq $-2147483648, %rdi
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   saddo.i64imm3
-; FAST:         addq $-2147483648, %rdi
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: saddo.i64imm2
+; CHECK:       addq $-2147483648, %rdi
+; CHECK-NEXT:  seto %al
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -2147483648)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -117,16 +133,12 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @saddo.i64imm4(i64 %v1, i64* %res) {
+define zeroext i1 @saddo.i64imm3(i64 %v1, i64* %res) {
 entry:
-; DAG-LABEL:    saddo.i64imm4
-; DAG:          movabsq $-21474836489, %[[REG:[a-z]+]]
-; DAG-NEXT:     addq %rdi, %[[REG]]
-; DAG-NEXT:     seto
-; FAST-LABEL:   saddo.i64imm4
-; FAST:         movabsq $-21474836489, %[[REG:[a-z]+]]
-; FAST-NEXT:    addq %rdi, %[[REG]]
-; FAST-NEXT:    seto
+; CHECK-LABEL: saddo.i64imm3
+; CHECK:       movabsq $-21474836489, %[[REG:[a-z]+]]
+; CHECK-NEXT:  addq %rdi, %[[REG]]
+; CHECK-NEXT:  seto
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -21474836489)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -134,14 +146,11 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @saddo.i64imm5(i64 %v1, i64* %res) {
+define zeroext i1 @saddo.i64imm4(i64 %v1, i64* %res) {
 entry:
-; DAG-LABEL:    saddo.i64imm5
-; DAG:          addq $2147483647, %rdi
-; DAG-NEXT:     seto
-; FAST-LABEL:   saddo.i64imm5
-; FAST:         addq $2147483647, %rdi
-; FAST-NEXT:    seto
+; CHECK-LABEL: saddo.i64imm4
+; CHECK:       addq $2147483647, %rdi
+; CHECK-NEXT:  seto
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483647)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -149,17 +158,12 @@ entry:
   ret i1 %obit
 }
 
-; TODO: FastISel shouldn't use movabsq.
-define zeroext i1 @saddo.i64imm6(i64 %v1, i64* %res) {
+define zeroext i1 @saddo.i64imm5(i64 %v1, i64* %res) {
 entry:
-; DAG-LABEL:    saddo.i64imm6
-; DAG:          movl $2147483648, %ecx
-; DAG:          addq %rdi, %rcx
-; DAG-NEXT:     seto
-; FAST-LABEL:   saddo.i64imm6
-; FAST:         movabsq $2147483648, %[[REG:[a-z]+]]
-; FAST:         addq %rdi, %[[REG]]
-; FAST-NEXT:     seto
+; CHECK-LABEL: saddo.i64imm5
+; CHECK:       movl $2147483648
+; CHECK:       addq %rdi
+; CHECK-NEXT:  seto
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483648)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -170,12 +174,9 @@ entry:
 ; UADDO
 define zeroext i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
-; DAG-LABEL:    uaddo.i32
-; DAG:          addl %esi, %edi
-; DAG-NEXT:     setb %al
-; FAST-LABEL:   uaddo.i32
-; FAST:         addl %esi, %edi
-; FAST-NEXT:    setb %al
+; CHECK-LABEL: uaddo.i32
+; CHECK:       addl %esi, %edi
+; CHECK-NEXT:  setb %al
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -185,12 +186,9 @@ entry:
 
 define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
-; DAG-LABEL:    uaddo.i64
-; DAG:          addq %rsi, %rdi
-; DAG-NEXT:     setb %al
-; FAST-LABEL:   uaddo.i64
-; FAST:         addq %rsi, %rdi
-; FAST-NEXT:    setb %al
+; CHECK-LABEL: uaddo.i64
+; CHECK:       addq %rsi, %rdi
+; CHECK-NEXT:  setb %al
   %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -198,15 +196,57 @@ entry:
   ret i1 %obit
 }
 
+; UADDO reg, 1 | NOT INC
+define zeroext i1 @uaddo.inc.i8(i8 %v1, i8* %res) {
+entry:
+; CHECK-LABEL: uaddo.inc.i8
+; CHECK-NOT:   incb %dil
+  %t = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %v1, i8 1)
+  %val = extractvalue {i8, i1} %t, 0
+  %obit = extractvalue {i8, i1} %t, 1
+  store i8 %val, i8* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.inc.i16(i16 %v1, i16* %res) {
+entry:
+; CHECK-LABEL: uaddo.inc.i16
+; CHECK-NOT:   incw %di
+  %t = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %v1, i16 1)
+  %val = extractvalue {i16, i1} %t, 0
+  %obit = extractvalue {i16, i1} %t, 1
+  store i16 %val, i16* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.inc.i32(i32 %v1, i32* %res) {
+entry:
+; CHECK-LABEL: uaddo.inc.i32
+; CHECK-NOT:   incl %edi
+  %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 1)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.inc.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL: uaddo.inc.i64
+; CHECK-NOT:   incq %rdi
+  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 1)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
 ; SSUBO
 define zeroext i1 @ssubo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
-; DAG-LABEL:    ssubo.i32
-; DAG:          subl %esi, %edi
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   ssubo.i32
-; FAST:         subl %esi, %edi
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: ssubo.i32
+; CHECK:       subl %esi, %edi
+; CHECK-NEXT:  seto %al
   %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -216,12 +256,9 @@ entry:
 
 define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
-; DAG-LABEL:    ssubo.i64
-; DAG:          subq %rsi, %rdi
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   ssubo.i64
-; FAST:         subq %rsi, %rdi
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: ssubo.i64
+; CHECK:       subq %rsi, %rdi
+; CHECK-NEXT:  seto %al
   %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -232,12 +269,9 @@ entry:
 ; USUBO
 define zeroext i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
-; DAG-LABEL:    usubo.i32
-; DAG:          subl %esi, %edi
-; DAG-NEXT:     setb %al
-; FAST-LABEL:   usubo.i32
-; FAST:         subl %esi, %edi
-; FAST-NEXT:    setb %al
+; CHECK-LABEL: usubo.i32
+; CHECK:       subl %esi, %edi
+; CHECK-NEXT:  setb %al
   %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -247,12 +281,9 @@ entry:
 
 define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
-; DAG-LABEL:    usubo.i64
-; DAG:          subq %rsi, %rdi
-; DAG-NEXT:     setb %al
-; FAST-LABEL:   usubo.i64
-; FAST:         subq %rsi, %rdi
-; FAST-NEXT:    setb %al
+; CHECK-LABEL: usubo.i64
+; CHECK:       subq %rsi, %rdi
+; CHECK-NEXT:  setb %al
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -263,10 +294,10 @@ entry:
 ; SMULO
 define zeroext i1 @smulo.i8(i8 %v1, i8 %v2, i8* %res) {
 entry:
-; FAST-LABEL:   smulo.i8
-; FAST:         movb %dil, %al
-; FAST-NEXT:    imulb %sil
-; FAST-NEXT:    seto %cl
+; CHECK-LABEL:   smulo.i8
+; CHECK:         movb %dil, %al
+; CHECK-NEXT:    imulb %sil
+; CHECK-NEXT:    seto %cl
   %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
   %obit = extractvalue {i8, i1} %t, 1
@@ -276,12 +307,9 @@ entry:
 
 define zeroext i1 @smulo.i16(i16 %v1, i16 %v2, i16* %res) {
 entry:
-; DAG-LABEL:    smulo.i16
-; DAG:          imulw %si, %di
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   smulo.i16
-; FAST:         imulw %si, %di
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: smulo.i16
+; CHECK:       imulw %si, %di
+; CHECK-NEXT:  seto %al
   %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
   %obit = extractvalue {i16, i1} %t, 1
@@ -291,12 +319,9 @@ entry:
 
 define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
-; DAG-LABEL:    smulo.i32
-; DAG:          imull %esi, %edi
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   smulo.i32
-; FAST:         imull %esi, %edi
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: smulo.i32
+; CHECK:       imull %esi, %edi
+; CHECK-NEXT:  seto %al
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -306,12 +331,9 @@ entry:
 
 define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
-; DAG-LABEL:    smulo.i64
-; DAG:          imulq %rsi, %rdi
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   smulo.i64
-; FAST:         imulq %rsi, %rdi
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: smulo.i64
+; CHECK:       imulq %rsi, %rdi
+; CHECK-NEXT:  seto %al
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -322,10 +344,10 @@ entry:
 ; UMULO
 define zeroext i1 @umulo.i8(i8 %v1, i8 %v2, i8* %res) {
 entry:
-; FAST-LABEL:   umulo.i8
-; FAST:         movb %dil, %al
-; FAST-NEXT:    mulb %sil
-; FAST-NEXT:    seto %cl
+; CHECK-LABEL:   umulo.i8
+; CHECK:         movb %dil, %al
+; CHECK-NEXT:    mulb %sil
+; CHECK-NEXT:    seto %cl
   %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
   %obit = extractvalue {i8, i1} %t, 1
@@ -335,12 +357,9 @@ entry:
 
 define zeroext i1 @umulo.i16(i16 %v1, i16 %v2, i16* %res) {
 entry:
-; DAG-LABEL:    umulo.i16
-; DAG:          mulw %si
-; DAG-NEXT:     seto
-; FAST-LABEL:   umulo.i16
-; FAST:         mulw %si
-; FAST-NEXT:    seto
+; CHECK-LABEL: umulo.i16
+; CHECK:       mulw %si
+; CHECK-NEXT:  seto
   %t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
   %obit = extractvalue {i16, i1} %t, 1
@@ -350,12 +369,9 @@ entry:
 
 define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
-; DAG-LABEL:    umulo.i32
-; DAG:          mull %esi
-; DAG-NEXT:     seto
-; FAST-LABEL:   umulo.i32
-; FAST:         mull %esi
-; FAST-NEXT:    seto
+; CHECK-LABEL: umulo.i32
+; CHECK:       mull %esi
+; CHECK-NEXT:  seto
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -365,12 +381,9 @@ entry:
 
 define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
-; DAG-LABEL:    umulo.i64
-; DAG:          mulq %rsi
-; DAG-NEXT:     seto
-; FAST-LABEL:   umulo.i64
-; FAST:         mulq %rsi
-; FAST-NEXT:    seto
+; CHECK-LABEL: umulo.i64
+; CHECK:       mulq %rsi
+; CHECK-NEXT:  seto
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -383,9 +396,9 @@ entry:
 ;
 define i32 @saddo.select.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    saddo.select.i32
-; CHECK:          addl   %esi, %eax
-; CHECK-NEXT:     cmovol %edi, %esi
+; CHECK-LABEL: saddo.select.i32
+; CHECK:       addl   %esi, %eax
+; CHECK-NEXT:  cmovol %edi, %esi
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
@@ -394,9 +407,9 @@ entry:
 
 define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    saddo.select.i64
-; CHECK:          addq   %rsi, %rax
-; CHECK-NEXT:     cmovoq %rdi, %rsi
+; CHECK-LABEL: saddo.select.i64
+; CHECK:       addq   %rsi, %rax
+; CHECK-NEXT:  cmovoq %rdi, %rsi
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
@@ -405,9 +418,9 @@ entry:
 
 define i32 @uaddo.select.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    uaddo.select.i32
-; CHECK:          addl   %esi, %eax
-; CHECK-NEXT:     cmovbl %edi, %esi
+; CHECK-LABEL: uaddo.select.i32
+; CHECK:       addl   %esi, %eax
+; CHECK-NEXT:  cmovbl %edi, %esi
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
@@ -416,9 +429,9 @@ entry:
 
 define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    uaddo.select.i64
-; CHECK:          addq   %rsi, %rax
-; CHECK-NEXT:     cmovbq %rdi, %rsi
+; CHECK-LABEL: uaddo.select.i64
+; CHECK:       addq   %rsi, %rax
+; CHECK-NEXT:  cmovbq %rdi, %rsi
   %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
@@ -427,9 +440,9 @@ entry:
 
 define i32 @ssubo.select.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    ssubo.select.i32
-; CHECK:          cmpl   %esi, %edi
-; CHECK-NEXT:     cmovol %edi, %esi
+; CHECK-LABEL: ssubo.select.i32
+; CHECK:       cmpl   %esi, %edi
+; CHECK-NEXT:  cmovol %edi, %esi
   %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
@@ -438,9 +451,9 @@ entry:
 
 define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    ssubo.select.i64
-; CHECK:          cmpq   %rsi, %rdi
-; CHECK-NEXT:     cmovoq %rdi, %rsi
+; CHECK-LABEL: ssubo.select.i64
+; CHECK:       cmpq   %rsi, %rdi
+; CHECK-NEXT:  cmovoq %rdi, %rsi
   %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
@@ -449,9 +462,9 @@ entry:
 
 define i32 @usubo.select.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    usubo.select.i32
-; CHECK:          cmpl   %esi, %edi
-; CHECK-NEXT:     cmovbl %edi, %esi
+; CHECK-LABEL: usubo.select.i32
+; CHECK:       cmpl   %esi, %edi
+; CHECK-NEXT:  cmovbl %edi, %esi
   %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
@@ -460,9 +473,9 @@ entry:
 
 define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    usubo.select.i64
-; CHECK:          cmpq   %rsi, %rdi
-; CHECK-NEXT:     cmovbq %rdi, %rsi
+; CHECK-LABEL: usubo.select.i64
+; CHECK:       cmpq   %rsi, %rdi
+; CHECK-NEXT:  cmovbq %rdi, %rsi
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
@@ -471,9 +484,9 @@ entry:
 
 define i32 @smulo.select.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    smulo.select.i32
-; CHECK:          imull  %esi, %eax
-; CHECK-NEXT:     cmovol %edi, %esi
+; CHECK-LABEL: smulo.select.i32
+; CHECK:       imull  %esi, %eax
+; CHECK-NEXT:  cmovol %edi, %esi
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
@@ -482,9 +495,9 @@ entry:
 
 define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    smulo.select.i64
-; CHECK:          imulq  %rsi, %rax
-; CHECK-NEXT:     cmovoq %rdi, %rsi
+; CHECK-LABEL: smulo.select.i64
+; CHECK:       imulq  %rsi, %rax
+; CHECK-NEXT:  cmovoq %rdi, %rsi
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
@@ -493,9 +506,9 @@ entry:
 
 define i32 @umulo.select.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    umulo.select.i32
-; CHECK:          mull   %esi
-; CHECK-NEXT:     cmovol %edi, %esi
+; CHECK-LABEL: umulo.select.i32
+; CHECK:       mull   %esi
+; CHECK-NEXT:  cmovol %edi, %esi
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
@@ -504,9 +517,9 @@ entry:
 
 define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    umulo.select.i64
-; CHECK:          mulq   %rsi
-; CHECK-NEXT:     cmovoq %rdi, %rsi
+; CHECK-LABEL: umulo.select.i64
+; CHECK:       mulq   %rsi
+; CHECK-NEXT:  cmovoq %rdi, %rsi
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
@@ -519,9 +532,9 @@ entry:
 ;
 define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    saddo.br.i32
-; CHECK:          addl   %esi, %edi
-; CHECK-NEXT:     jo
+; CHECK-LABEL: saddo.br.i32
+; CHECK:       addl   %esi, %edi
+; CHECK-NEXT:  jo
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -536,9 +549,9 @@ continue:
 
 define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    saddo.br.i64
-; CHECK:          addq   %rsi, %rdi
-; CHECK-NEXT:     jo
+; CHECK-LABEL: saddo.br.i64
+; CHECK:       addq   %rsi, %rdi
+; CHECK-NEXT:  jo
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -553,9 +566,9 @@ continue:
 
 define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    uaddo.br.i32
-; CHECK:          addl   %esi, %edi
-; CHECK-NEXT:     jb
+; CHECK-LABEL: uaddo.br.i32
+; CHECK:       addl   %esi, %edi
+; CHECK-NEXT:  jb
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -570,9 +583,9 @@ continue:
 
 define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    uaddo.br.i64
-; CHECK:          addq   %rsi, %rdi
-; CHECK-NEXT:     jb
+; CHECK-LABEL: uaddo.br.i64
+; CHECK:       addq   %rsi, %rdi
+; CHECK-NEXT:  jb
   %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -587,9 +600,9 @@ continue:
 
 define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    ssubo.br.i32
-; CHECK:          cmpl   %esi, %edi
-; CHECK-NEXT:     jo
+; CHECK-LABEL: ssubo.br.i32
+; CHECK:       cmpl   %esi, %edi
+; CHECK-NEXT:  jo
   %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -604,9 +617,9 @@ continue:
 
 define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    ssubo.br.i64
-; CHECK:          cmpq   %rsi, %rdi
-; CHECK-NEXT:     jo
+; CHECK-LABEL: ssubo.br.i64
+; CHECK:       cmpq   %rsi, %rdi
+; CHECK-NEXT:  jo
   %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -621,9 +634,9 @@ continue:
 
 define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    usubo.br.i32
-; CHECK:          cmpl   %esi, %edi
-; CHECK-NEXT:     jb
+; CHECK-LABEL: usubo.br.i32
+; CHECK:       cmpl   %esi, %edi
+; CHECK-NEXT:  jb
   %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -638,9 +651,9 @@ continue:
 
 define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    usubo.br.i64
-; CHECK:          cmpq   %rsi, %rdi
-; CHECK-NEXT:     jb
+; CHECK-LABEL: usubo.br.i64
+; CHECK:       cmpq   %rsi, %rdi
+; CHECK-NEXT:  jb
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -655,9 +668,9 @@ continue:
 
 define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    smulo.br.i32
-; CHECK:          imull  %esi, %edi
-; CHECK-NEXT:     jo
+; CHECK-LABEL: smulo.br.i32
+; CHECK:       imull  %esi, %edi
+; CHECK-NEXT:  jo
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -672,9 +685,9 @@ continue:
 
 define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    smulo.br.i64
-; CHECK:          imulq  %rsi, %rdi
-; CHECK-NEXT:     jo
+; CHECK-LABEL: smulo.br.i64
+; CHECK:       imulq  %rsi, %rdi
+; CHECK-NEXT:  jo
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -689,9 +702,9 @@ continue:
 
 define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    umulo.br.i32
-; CHECK:          mull  %esi
-; CHECK-NEXT:     jo
+; CHECK-LABEL: umulo.br.i32
+; CHECK:       mull  %esi
+; CHECK-NEXT:  jo
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -706,9 +719,9 @@ continue:
 
 define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    umulo.br.i64
-; CHECK:          mulq  %rsi
-; CHECK-NEXT:     jo
+; CHECK-LABEL: umulo.br.i64
+; CHECK:       mulq  %rsi
+; CHECK-NEXT:  jo
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -725,6 +738,8 @@ declare {i8,  i1} @llvm.sadd.with.overflow.i8 (i8,  i8 ) nounwind readnone
 declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) nounwind readnone
 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
 declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i8,  i1} @llvm.uadd.with.overflow.i8 (i8,  i8 ) nounwind readnone
+declare {i16, i1} @llvm.uadd.with.overflow.i16(i16, i16) nounwind readnone
 declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
 declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
 declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/XCore/atomic.ll b/test/CodeGen/XCore/atomic.ll
index 58ef38b..6ca80cf 100644
--- a/test/CodeGen/XCore/atomic.ll
+++ b/test/CodeGen/XCore/atomic.ll
@@ -22,11 +22,10 @@ entry:
 ; CHECK-LABEL: atomicloadstore
 
 ; CHECK: ldw r[[R0:[0-9]+]], dp[pool]
-; CHECK-NEXT: #MEMBARRIER
-  %0 = load atomic i32* bitcast (i64* @pool to i32*) acquire, align 4
-
 ; CHECK-NEXT: ldaw r[[R1:[0-9]+]], dp[pool]
+; CHECK-NEXT: #MEMBARRIER
 ; CHECK-NEXT: ldc r[[R2:[0-9]+]], 0
+  %0 = load atomic i32* bitcast (i64* @pool to i32*) acquire, align 4
 
 ; CHECK-NEXT: ld16s r3, r[[R1]][r[[R2]]]
 ; CHECK-NEXT: #MEMBARRIER
diff --git a/test/CodeGen/XCore/dwarf_debug.ll b/test/CodeGen/XCore/dwarf_debug.ll
index 2f4b231..47db82d 100644
--- a/test/CodeGen/XCore/dwarf_debug.ll
+++ b/test/CodeGen/XCore/dwarf_debug.ll
@@ -13,27 +13,27 @@ define i32 @f(i32 %a) {
 entry:
   %a.addr = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !11), !dbg !12
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !11, metadata !{metadata !"0x102"}), !dbg !12
   %0 = load i32* %a.addr, align 4, !dbg !12
   %add = add nsw i32 %0, 1, !dbg !12
   ret i32 %add, !dbg !12
 }
 
-declare void @llvm.dbg.declare(metadata, metadata)
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9, !10}
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1}
+!0 = metadata !{metadata !"0x11\0012\00\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !"", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f", metadata !"f", metadata !"", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @f, null, null, metadata !2, i32 2}
-!5 = metadata !{i32 786473, metadata !1}
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null}
+!4 = metadata !{metadata !"0x2e\00f\00f\00\002\000\001\000\006\00256\000\002", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @f, null, null, metadata !2} ; [ DW_TAG_subprogram ]
+!5 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ]
 !7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
-!11 = metadata !{i32 786689, metadata !4, metadata !"a", metadata !5, i32 16777218, metadata !8, i32 0, i32 0}
+!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!11 = metadata !{metadata !"0x101\00a\0016777218\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ]
 !12 = metadata !{i32 2, i32 0, metadata !4, null}
 
diff --git a/test/CodeGen/XCore/exception.ll b/test/CodeGen/XCore/exception.ll
index 3179fcd..fec83eb 100644
--- a/test/CodeGen/XCore/exception.ll
+++ b/test/CodeGen/XCore/exception.ll
@@ -107,17 +107,12 @@ Exit:
 ; CHECK: .asciiz
 ; CHECK: .byte  3
 ; CHECK: .byte  26
-; CHECK: [[SET0:.L[a-zA-Z0-9_]+]] = [[PRE_G]]-[[START]]
-; CHECK: .long [[SET0]]
-; CHECK: [[SET1:.L[a-zA-Z0-9_]+]] = [[POST_G]]-[[PRE_G]]
-; CHECK: .long [[SET1]]
-; CHECK: [[SET2:.L[a-zA-Z0-9_]+]] = [[LANDING]]-[[START]]
-; CHECK: .long [[SET2]]
+; CHECK: .long [[PRE_G]]-[[START]]
+; CHECK: .long [[POST_G]]-[[PRE_G]]
+; CHECK: .long [[LANDING]]-[[START]]
 ; CHECK: .byte 3
-; CHECK: [[SET3:.L[a-zA-Z0-9_]+]] = [[POST_G]]-[[START]]
-; CHECK: .long [[SET3]]
-; CHECK: [[SET4:.L[a-zA-Z0-9_]+]] = [[END]]-[[POST_G]]
-; CHECK: .long [[SET4]]
+; CHECK: .long [[POST_G]]-[[START]]
+; CHECK: .long [[END]]-[[POST_G]]
 ; CHECK: .long 0
 ; CHECK: .byte 0
 ; CHECK: .byte 1
author	Stephen Hines <srhines@google.com>	2014-12-01 14:51:49 -0800
committer	Stephen Hines <srhines@google.com>	2014-12-02 16:08:10 -0800
commit	37ed9c199ca639565f6ce88105f9e39e898d82d0 (patch)
tree	8fb36d3910e3ee4c4e1b7422f4f017108efc52f5 /test/CodeGen
parent	d2327b22152ced7bc46dc629fc908959e8a52d03 (diff)
download	external_llvm-37ed9c199ca639565f6ce88105f9e39e898d82d0.zip external_llvm-37ed9c199ca639565f6ce88105f9e39e898d82d0.tar.gz external_llvm-37ed9c199ca639565f6ce88105f9e39e898d82d0.tar.bz2